Update app.py
app.py
CHANGED
@@ -1,4 +1,4 @@
-# app.py — DeepSeek-OCR + BioMedLM (
+# app.py — DeepSeek-OCR + BioMedLM (remote text_generation + ZeroGPU-safe local) — Gradio 5
 import os, tempfile, traceback, json
 import gradio as gr
 import torch
@@ -14,8 +14,12 @@ import requests
 BIO_REMOTE = os.getenv("BIO_REMOTE", "1") == "1"  # recommended on Spaces ZeroGPU
 BIO_MODEL_ID = os.getenv("BIO_MODEL_ID", "stanford-crfm/BioMedLM").strip()
 HF_TOKEN = os.getenv("HF_TOKEN")
-HF_PROVIDER = os.getenv("HF_PROVIDER", "hf-inference").strip()
 
+# Fallbacks
+BIO_FALLBACK_HTTP = os.getenv("BIO_FALLBACK_HTTP", "1") == "1"    # if InferenceClient fails => HTTP router
+BIO_FALLBACK_LOCAL = os.getenv("BIO_FALLBACK_LOCAL", "1") == "1"  # if all remote paths fail => try local GPU
+
+# Generation parameters
 GEN_TEMPERATURE = float(os.getenv("GEN_TEMPERATURE", "0.2"))
 GEN_TOP_P = float(os.getenv("GEN_TOP_P", "0.9"))
 GEN_MAX_NEW_TOKENS = int(os.getenv("GEN_MAX_NEW_TOKENS", "512"))
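The new flags are read once at import time with os.getenv, so the whole fallback chain can be reconfigured without editing the file. A minimal sketch of exercising them (values are illustrative, not from the commit):

# Sketch: drive the fallback chain through env vars before the module loads.
# Variable names come from the diff above; the values are only examples.
import os

os.environ["BIO_REMOTE"] = "1"            # prefer remote inference (ZeroGPU-safe)
os.environ["BIO_FALLBACK_HTTP"] = "1"     # allow the HTTP-router fallback
os.environ["BIO_FALLBACK_LOCAL"] = "0"    # skip the local-GPU fallback
os.environ["GEN_MAX_NEW_TOKENS"] = "256"  # shorter answers for a quick smoke test

import app  # app.py reads all of the above at import time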
@@ -65,40 +69,36 @@ def build_prompt(chat_msgs, ocr_md, ocr_txt, user_msg):
     return prompt
 
 # =========================
-# BioMedLM remote/local
+# BioMedLM remote/local (NO CUDA in main)
 # =========================
 def get_biomedlm():
     """Decide the mode. Do not touch CUDA here."""
     global _hf_client
     if BIO_REMOTE:
         if _hf_client is None:
-            # timeout goes in the constructor
+            # timeout goes in the constructor (not in the call)
             _hf_client = InferenceClient(
                 model=BIO_MODEL_ID,
-                provider=HF_PROVIDER,
                 token=HF_TOKEN,
-                timeout=GEN_TIMEOUT,
+                timeout=GEN_TIMEOUT,
             )
         return ("remote", _hf_client)
     return ("local", None)
 
-def _hf_http_chat(prompt: str) -> str:
-    """HTTP fallback to the HF router (
+def _hf_http_completions(prompt: str) -> str:
+    """HTTP fallback to the HF router (OpenAI-like /v1/completions)."""
     headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
     payload = {
         "model": BIO_MODEL_ID,
-        "
+        "prompt": prompt,
         "max_tokens": GEN_MAX_NEW_TOKENS,
         "temperature": GEN_TEMPERATURE,
         "top_p": GEN_TOP_P,
         "stop": STOP_SEQS,
     }
-
-    # 1) OpenAI-compat route
     urls = [
-        "https://router.huggingface.co/v1/chat/completions",
-
-        "https://router.huggingface.co/hf-inference/v1/chat/completions",
+        "https://router.huggingface.co/v1/completions",
+        "https://router.huggingface.co/hf-inference/v1/completions",
    ]
     last_exc = None
     for url in urls:
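For a standalone check of the endpoint _hf_http_completions now targets, a request like the following should mirror the payload built above (the token and prompt are assumptions):

# Sketch: call the HF router's OpenAI-style completions endpoint directly.
# URL and payload mirror _hf_http_completions; HF_TOKEN must be set.
import os, requests

r = requests.post(
    "https://router.huggingface.co/v1/completions",
    headers={"Authorization": f"Bearer {os.environ['HF_TOKEN']}"},
    json={
        "model": "stanford-crfm/BioMedLM",
        "prompt": "Question: What is hypertension?\nAnswer:",  # hypothetical prompt
        "max_tokens": 64,
        "temperature": 0.2,
    },
    timeout=60,
)
r.raise_for_status()
print(r.json()["choices"][0]["text"])  # the completions API returns "text", not "message"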
@@ -106,39 +106,43 @@ def _hf_http_chat(prompt: str) -> str:
             r = requests.post(url, headers=headers, json=payload, timeout=GEN_TIMEOUT)
             if r.status_code == 200:
                 data = r.json()
-                # OpenAI-like
+                # OpenAI completions-like
                 if isinstance(data, dict) and "choices" in data and data["choices"]:
-
-                    return (msg.get("content") or "").strip()
+                    return (data["choices"][0].get("text") or "").strip()
                 return json.dumps(data)[:4000]
-            # if 410 on the old API, keep trying
             last_exc = RuntimeError(f"HTTP {r.status_code}: {r.text[:800]}")
         except Exception as e:
             last_exc = e
-    raise last_exc or RuntimeError("HF router error")
+    raise last_exc or RuntimeError("HF router completions error")
 
 def call_biomedlm_remote(prompt: str) -> (str, str):
     """
-    Uses 
+    Uses InferenceClient.text_generation (a task BioMedLM supports).
+    If it fails, falls back to the HTTP router /v1/completions.
     Returns (answer, debug_msg)
     """
     client = get_biomedlm()[1]
     try:
-
-
-
-            max_tokens=GEN_MAX_NEW_TOKENS,
+        out = client.text_generation(
+            prompt=prompt,
+            max_new_tokens=GEN_MAX_NEW_TOKENS,
             temperature=GEN_TEMPERATURE,
             top_p=GEN_TOP_P,
-
+            repetition_penalty=GEN_REP_PENALTY,
+            stop_sequences=STOP_SEQS,
+            details=False,
+            stream=False,
         )
-
+        # huggingface_hub returns a str when details=False
+        answer = out.strip() if isinstance(out, str) else str(out)
         return answer, ""
     except Exception as e:
-
+        if not BIO_FALLBACK_HTTP:
+            raise
+        # HTTP fallback to the new router (completions)
         try:
-            answer = 
-            return answer, f"[Fallback HTTP router] {e.__class__.__name__}: {e}"
+            answer = _hf_http_completions(prompt)
+            return answer, f"[Fallback HTTP router/completions] {e.__class__.__name__}: {e}"
         except Exception as e2:
             raise RuntimeError(
                 f"Remote generation failed: {e.__class__.__name__}: {e} | HTTP fallback: {e2.__class__.__name__}: {e2}"
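The switch from the chat API to text_generation is the core of this hunk; in isolation the call looks roughly like this (a sketch assuming HF_TOKEN is set, with parameter values mirroring the GEN_* defaults from the diff):

# Sketch: InferenceClient.text_generation as used in call_biomedlm_remote.
import os
from huggingface_hub import InferenceClient

client = InferenceClient(model="stanford-crfm/BioMedLM",
                         token=os.getenv("HF_TOKEN"), timeout=60)
out = client.text_generation(
    "Question: What is hypertension?\nAnswer:",  # hypothetical prompt
    max_new_tokens=512,
    temperature=0.2,
    top_p=0.9,
    details=False,  # return a plain str
    stream=False,
)
print(out.strip())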
@@ -191,14 +195,36 @@ def biomedlm_reply(user_msg, chat_msgs, ocr_md, ocr_txt):
 
         mode, _ = get_biomedlm()
         if mode == "remote":
-
-
-
-
-
-
-
-
+            try:
+                answer, dbg = call_biomedlm_remote(prompt)
+                updated = (chat_msgs or []) + [
+                    {"role": "user", "content": user_msg},
+                    {"role": "assistant", "content": answer}
+                ]
+                return updated, "", gr.update(value=dbg)
+            except Exception as e_remote:
+                if not BIO_FALLBACK_LOCAL:
+                    raise
+                # Fall back to local if remote is unavailable
+                res = biomedlm_infer_local(
+                    prompt,
+                    temperature=GEN_TEMPERATURE,
+                    top_p=GEN_TOP_P,
+                    rep_penalty=GEN_REP_PENALTY,
+                    max_new_tokens=GEN_MAX_NEW_TOKENS
+                )
+                if res.startswith("OK::"):
+                    answer = res[4:]
+                    updated = (chat_msgs or []) + [
+                        {"role": "user", "content": user_msg},
+                        {"role": "assistant", "content": answer}
+                    ]
+                    return updated, "", gr.update(value=f"[Remote→Local] {e_remote}")
+                else:
+                    err_msg = res[5:] if res.startswith("ERR::") else res
+                    raise RuntimeError(f"Remote error: {e_remote} | Local error: {err_msg}")
+
+        # Explicit local mode
         res = biomedlm_infer_local(
             prompt,
             temperature=GEN_TEMPERATURE,
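The local path signals success with a string prefix instead of exceptions, which is why the remote→local handler inspects res before deciding. A minimal sketch of that contract (the infer function here is hypothetical; only the OK::/ERR:: convention comes from the diff):

# Sketch of the OK::/ERR:: protocol returned by biomedlm_infer_local.
# _infer is hypothetical; only the prefix convention comes from the diff.
def _infer(prompt: str) -> str:
    try:
        return "OK::" + f"(generated text for {prompt!r})"
    except Exception as e:
        return f"ERR::{e.__class__.__name__}: {e}"

res = _infer("hello")
if res.startswith("OK::"):
    print(res[4:])            # len("OK::") == 4
else:
    print("error:", res[5:])  # len("ERR::") == 5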
@@ -215,13 +241,11 @@ def biomedlm_reply(user_msg, chat_msgs, ocr_md, ocr_txt):
             return updated, "", gr.update(value="")
         else:
             err_msg = res[5:] if res.startswith("ERR::") else res
-            # fall back to remote if allowed
-            answer2, dbg2 = call_biomedlm_remote(prompt)
             updated = (chat_msgs or []) + [
                 {"role": "user", "content": user_msg},
-                {"role": "assistant", "content": 
+                {"role": "assistant", "content": "⚠️ LLM error (local). Check the debug panel."}
             ]
-            return updated, "", gr.update(value=
+            return updated, "", gr.update(value=err_msg)
 
     except Exception as e:
         err = f"{e.__class__.__name__}: {str(e) or repr(e)}"
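Throughout the new code, chat history is a plain list of role/content dicts, which is what Gradio 5's messages-format Chatbot expects. A minimal sketch of that wiring (component names are assumptions; only the dict shape and the return tuple mirror biomedlm_reply):

# Sketch: the role/content dicts above match gr.Chatbot(type="messages").
import gradio as gr

def reply(user_msg, history):
    history = (history or []) + [
        {"role": "user", "content": user_msg},
        {"role": "assistant", "content": f"echo: {user_msg}"},  # placeholder answer
    ]
    return history, "", gr.update(value="")  # same shape biomedlm_reply returns

with gr.Blocks() as demo:
    chat = gr.Chatbot(type="messages")
    box = gr.Textbox()
    debug = gr.Textbox(label="debug")
    box.submit(reply, [box, chat], [chat, box, debug])

demo.launch()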