CharlieBonito committed on
Commit
4e37b96
·
verified ·
1 Parent(s): 66f6169

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -38
app.py CHANGED
@@ -30,12 +30,12 @@ def log(msg):
30
 
31
  def start_server():
32
  os.makedirs(MODEL_DIR, exist_ok=True)
33
- log("Descargando modelo (puede tardar varios minutos)...")
34
  try:
35
  m_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE, local_dir=MODEL_DIR)
36
- log(f"Modelo descargado en: {m_path}")
37
  mm_path = hf_hub_download(repo_id=MODEL_REPO, filename=MMPROJ_FILE, local_dir=MODEL_DIR)
38
- log(f"mmproj descargado en: {mm_path}")
39
  except Exception as e:
40
  log(f"FALLO en descarga: {e}")
41
  raise
@@ -45,30 +45,22 @@ def start_server():
45
 
46
  log(f"Binario OK: {LLAMA_SERVER}")
47
 
48
- # Chequeo rápido de dependencias (opcional)
49
- try:
50
- result = subprocess.run(["ldd", LLAMA_SERVER], capture_output=True, text=True)
51
- log("ldd output:\n" + result.stdout)
52
- except Exception:
53
- log("No se pudo ejecutar ldd, continuando...")
54
-
55
  env = os.environ.copy()
56
- # Dejar que el sistema maneje las librerías
57
- ld_path = "/usr/local/lib:/usr/local/cuda/lib64:/usr/local/nvidia/lib64:" + env.get("LD_LIBRARY_PATH", "")
58
- env["LD_LIBRARY_PATH"] = ld_path
59
 
 
60
  cmd = [
61
  LLAMA_SERVER,
62
  "-m", m_path,
63
- "--mmproj", mm_path, # Visión activada
64
  "--host", "127.0.0.1",
65
  "--port", "8080",
66
- "-c", "4096",
67
- "-ngl", "99",
68
- "-fa", "on", # Flash Attention
69
- "-np", "1",
70
- "-fit", "off",
71
- # Sin --no-mmap, el binario precompilado maneja bien mmap
72
  ]
73
  log(f"Lanzando: {' '.join(cmd)}")
74
  return subprocess.Popen(
@@ -77,32 +69,30 @@ def start_server():
77
  text=True, bufsize=1
78
  )
79
 
80
-
81
  def monitor_engine():
82
  global server_ready
83
  try:
84
- log("Arrancando monitor_engine...")
85
  proc = start_server()
86
- log(f"PID llama-server: {proc.pid}")
87
  for line in proc.stdout:
88
  line = line.strip()
89
  log(f"[llama] {line}")
90
  if "HTTP server listening" in line:
91
  server_ready = True
92
- log("MOTOR EN LINEA")
93
  ret = proc.wait()
94
- log(f"llama-server terminó con código: {ret}")
95
  except Exception as e:
96
- log(f"EXCEPCION MONITOR: {e}")
97
  log(traceback.format_exc())
98
 
99
-
100
  def respond(history):
101
  if not server_ready:
102
- yield "Motor cargando... aún no está listo. Consulta los logs para más información."
103
  return
104
 
105
- api_messages = [{"role": "system", "content": "Eres ClarityGuard. Responde de forma breve y amigable."}]
106
  for m in history:
107
  content = m["content"]
108
  if isinstance(content, list):
@@ -112,8 +102,8 @@ def respond(history):
112
  try:
113
  r = requests.post(
114
  f"{SERVER_URL}/v1/chat/completions",
115
- json={"messages": api_messages, "stream": True, "temperature": 0.2, "max_tokens": 512},
116
- stream=True, timeout=90
117
  )
118
  full_text = ""
119
  for line in r.iter_lines():
@@ -126,18 +116,17 @@ def respond(history):
126
  if chunk == "[DONE]":
127
  break
128
  try:
129
- delta = json.loads(chunk)["choices"][0].get("delta", {}).get("content", "")
130
  full_text += delta
131
  yield full_text
132
- except Exception:
133
  continue
134
  except Exception as e:
135
- yield f"Error: {e}"
136
-
137
 
138
  with gr.Blocks() as demo:
139
- gr.Markdown("# 🔍 ClarityGuard — Neuro-inclusive Communication Assistant")
140
- chatbot = gr.Chatbot(height=450)
141
  msg = gr.Textbox(placeholder="Escribe tu mensaje y presiona Enter...")
142
 
143
  def user_fn(message, history):
@@ -149,7 +138,7 @@ with gr.Blocks() as demo:
149
  def bot_fn(history):
150
  history.append({"role": "assistant", "content": ""})
151
  for chunk in respond(history[:-1]):
152
- history[-1] = {"role": "assistant", "content": chunk}
153
  yield history
154
 
155
  msg.submit(user_fn, [msg, chatbot], [msg, chatbot]).then(
 
30
 
31
  def start_server():
32
  os.makedirs(MODEL_DIR, exist_ok=True)
33
+ log("Descargando modelo...")
34
  try:
35
  m_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE, local_dir=MODEL_DIR)
36
+ log(f"Modelo en: {m_path}")
37
  mm_path = hf_hub_download(repo_id=MODEL_REPO, filename=MMPROJ_FILE, local_dir=MODEL_DIR)
38
+ log(f"mmproj en: {mm_path}")
39
  except Exception as e:
40
  log(f"FALLO en descarga: {e}")
41
  raise
 
45
 
46
  log(f"Binario OK: {LLAMA_SERVER}")
47
 
 
 
 
 
 
 
 
48
  env = os.environ.copy()
49
+ env["LD_LIBRARY_PATH"] = "/usr/local/lib:/usr/local/cuda/lib64:" + env.get("LD_LIBRARY_PATH", "")
 
 
50
 
51
+ # A100 con 80 GB VRAM → contexto amplio y Flash Attention
52
  cmd = [
53
  LLAMA_SERVER,
54
  "-m", m_path,
55
+ "--mmproj", mm_path,
56
  "--host", "127.0.0.1",
57
  "--port", "8080",
58
+ "-c", "16384", # 16k de contexto (sobra espacio)
59
+ "-ngl", "99", # todas las capas a GPU
60
+ "-fa", "on", # Flash Attention
61
+ "-np", "1", # un solo slot de inferencia
62
+ "-fit", "off", # desactivar ajuste automático
63
+ # mmap por defecto (rápido y fiable)
64
  ]
65
  log(f"Lanzando: {' '.join(cmd)}")
66
  return subprocess.Popen(
 
69
  text=True, bufsize=1
70
  )
71
 
 
72
  def monitor_engine():
73
  global server_ready
74
  try:
75
+ log("Arrancando monitor...")
76
  proc = start_server()
77
+ log(f"PID: {proc.pid}")
78
  for line in proc.stdout:
79
  line = line.strip()
80
  log(f"[llama] {line}")
81
  if "HTTP server listening" in line:
82
  server_ready = True
83
+ log("🔥 MOTOR EN LÍNEA (A100)")
84
  ret = proc.wait()
85
+ log(f"Servidor terminó con código: {ret}")
86
  except Exception as e:
87
+ log(f"EXCEPCIÓN MONITOR: {e}")
88
  log(traceback.format_exc())
89
 
 
90
  def respond(history):
91
  if not server_ready:
92
+ yield " Cargando el motor en la A100... espera unos segundos."
93
  return
94
 
95
+ api_messages = [{"role": "system", "content": "Eres ClarityGuard, un asistente neuroinclusivo de análisis de comunicación. Responde con empatía y precisión."}]
96
  for m in history:
97
  content = m["content"]
98
  if isinstance(content, list):
 
102
  try:
103
  r = requests.post(
104
  f"{SERVER_URL}/v1/chat/completions",
105
+ json={"messages": api_messages, "stream": True, "temperature": 0.2, "max_tokens": 1024},
106
+ stream=True, timeout=120
107
  )
108
  full_text = ""
109
  for line in r.iter_lines():
 
116
  if chunk == "[DONE]":
117
  break
118
  try:
119
+ delta = json.loads(chunk)["choices"][0]["delta"]["content"]
120
  full_text += delta
121
  yield full_text
122
+ except:
123
  continue
124
  except Exception as e:
125
+ yield f"Error: {e}"
 
126
 
127
  with gr.Blocks() as demo:
128
+ gr.Markdown("# 🔍 ClarityGuard — Asistente Neuroinclusivo (A100)")
129
+ chatbot = gr.Chatbot(height=500)
130
  msg = gr.Textbox(placeholder="Escribe tu mensaje y presiona Enter...")
131
 
132
  def user_fn(message, history):
 
138
  def bot_fn(history):
139
  history.append({"role": "assistant", "content": ""})
140
  for chunk in respond(history[:-1]):
141
+ history[-1]["content"] = chunk
142
  yield history
143
 
144
  msg.submit(user_fn, [msg, chatbot], [msg, chatbot]).then(