jorgeiv500 committed
Commit 1cb9d27 · verified · 1 Parent(s): 4a2190b

Update app.py

Files changed (1):
  1. app.py +65 -41
app.py CHANGED
@@ -1,4 +1,4 @@
-# app.py — DeepSeek-OCR + BioMedLM (HF router fix + ZeroGPU-safe) — Gradio 5
+# app.py — DeepSeek-OCR + BioMedLM (remote text_generation + ZeroGPU-safe local) — Gradio 5
 import os, tempfile, traceback, json
 import gradio as gr
 import torch
@@ -14,8 +14,12 @@ import requests
 BIO_REMOTE = os.getenv("BIO_REMOTE", "1") == "1"  # recommended on Spaces ZeroGPU
 BIO_MODEL_ID = os.getenv("BIO_MODEL_ID", "stanford-crfm/BioMedLM").strip()
 HF_TOKEN = os.getenv("HF_TOKEN")
-HF_PROVIDER = os.getenv("HF_PROVIDER", "hf-inference").strip()
 
+# Fallbacks
+BIO_FALLBACK_HTTP = os.getenv("BIO_FALLBACK_HTTP", "1") == "1"    # if InferenceClient fails => HTTP router
+BIO_FALLBACK_LOCAL = os.getenv("BIO_FALLBACK_LOCAL", "1") == "1"  # if everything remote fails => try local GPU
+
+# Generation parameters
 GEN_TEMPERATURE = float(os.getenv("GEN_TEMPERATURE", "0.2"))
 GEN_TOP_P = float(os.getenv("GEN_TOP_P", "0.9"))
 GEN_MAX_NEW_TOKENS = int(os.getenv("GEN_MAX_NEW_TOKENS", "512"))
@@ -65,40 +69,36 @@ def build_prompt(chat_msgs, ocr_md, ocr_txt, user_msg):
     return prompt
 
 # =========================
-# BioMedLM remote/local
+# BioMedLM remote/local (no CUDA in the main process)
 # =========================
 def get_biomedlm():
     """Decide the mode. Do not touch CUDA here."""
     global _hf_client
     if BIO_REMOTE:
         if _hf_client is None:
-            # the timeout goes in the client constructor (not in text_generation)
+            # the timeout goes in the constructor (not in the call)
             _hf_client = InferenceClient(
                 model=BIO_MODEL_ID,
-                provider=HF_PROVIDER,
                 token=HF_TOKEN,
-                timeout=GEN_TIMEOUT,  # ← this is correct
+                timeout=GEN_TIMEOUT,
             )
         return ("remote", _hf_client)
     return ("local", None)
 
-def _hf_http_chat(prompt: str) -> str:
-    """HTTP fallback to the HF router (two possible routes)."""
+def _hf_http_completions(prompt: str) -> str:
+    """HTTP fallback to the HF router (OpenAI-like /v1/completions)."""
     headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
     payload = {
         "model": BIO_MODEL_ID,
-        "messages": [{"role": "user", "content": prompt}],
+        "prompt": prompt,
         "max_tokens": GEN_MAX_NEW_TOKENS,
         "temperature": GEN_TEMPERATURE,
         "top_p": GEN_TOP_P,
         "stop": STOP_SEQS,
     }
-
-    # 1) OpenAI-compatible route
     urls = [
-        "https://router.huggingface.co/v1/chat/completions",
-        # 2) some clients expect the /hf-inference prefix
-        "https://router.huggingface.co/hf-inference/v1/chat/completions",
+        "https://router.huggingface.co/v1/completions",
+        "https://router.huggingface.co/hf-inference/v1/completions",
     ]
     last_exc = None
     for url in urls:
@@ -106,39 +106,43 @@ def _hf_http_chat(prompt: str) -> str:
             r = requests.post(url, headers=headers, json=payload, timeout=GEN_TIMEOUT)
             if r.status_code == 200:
                 data = r.json()
-                # OpenAI-like response
+                # OpenAI completions-like response
                 if isinstance(data, dict) and "choices" in data and data["choices"]:
-                    msg = data["choices"][0].get("message") or {}
-                    return (msg.get("content") or "").strip()
+                    return (data["choices"][0].get("text") or "").strip()
                 return json.dumps(data)[:4000]
-            # if the old API returns 410, keep trying
             last_exc = RuntimeError(f"HTTP {r.status_code}: {r.text[:800]}")
         except Exception as e:
             last_exc = e
-    raise last_exc or RuntimeError("HF router error")
+    raise last_exc or RuntimeError("HF router completions error")
 
 def call_biomedlm_remote(prompt: str) -> (str, str):
     """
-    Uses chat.completions.create (OpenAI-like). If it fails, falls back to the HTTP router.
+    Uses InferenceClient.text_generation (a task BioMedLM supports).
+    If it fails, falls back to the HTTP router /v1/completions.
     Returns (answer, debug_msg)
     """
     client = get_biomedlm()[1]
     try:
-        resp = client.chat.completions.create(
-            model=BIO_MODEL_ID,
-            messages=[{"role": "user", "content": prompt}],
-            max_tokens=GEN_MAX_NEW_TOKENS,
+        out = client.text_generation(
+            prompt=prompt,
+            max_new_tokens=GEN_MAX_NEW_TOKENS,
             temperature=GEN_TEMPERATURE,
             top_p=GEN_TOP_P,
-            stop=STOP_SEQS,
+            repetition_penalty=GEN_REP_PENALTY,
+            stop_sequences=STOP_SEQS,
+            details=False,
+            stream=False,
         )
-        answer = (resp.choices[0].message.content or "").strip()
+        # huggingface_hub returns a str when details=False
+        answer = out.strip() if isinstance(out, str) else str(out)
        return answer, ""
    except Exception as e:
-        # HTTP fallback to the new router
+        if not BIO_FALLBACK_HTTP:
+            raise
+        # HTTP fallback to the new router (completions)
        try:
-            answer = _hf_http_chat(prompt)
-            return answer, f"[Fallback HTTP router] {e.__class__.__name__}: {e}"
+            answer = _hf_http_completions(prompt)
+            return answer, f"[Fallback HTTP router/completions] {e.__class__.__name__}: {e}"
        except Exception as e2:
            raise RuntimeError(
                f"Remote generation failed: {e.__class__.__name__}: {e} | HTTP fallback: {e2.__class__.__name__}: {e2}"
@@ -191,14 +195,36 @@ def biomedlm_reply(user_msg, chat_msgs, ocr_md, ocr_txt):
 
         mode, _ = get_biomedlm()
         if mode == "remote":
-            answer, dbg = call_biomedlm_remote(prompt)
-            updated = (chat_msgs or []) + [
-                {"role": "user", "content": user_msg},
-                {"role": "assistant", "content": answer}
-            ]
-            return updated, "", gr.update(value=dbg)
-
-        # Local (ZeroGPU)
+            try:
+                answer, dbg = call_biomedlm_remote(prompt)
+                updated = (chat_msgs or []) + [
+                    {"role": "user", "content": user_msg},
+                    {"role": "assistant", "content": answer}
+                ]
+                return updated, "", gr.update(value=dbg)
+            except Exception as e_remote:
+                if not BIO_FALLBACK_LOCAL:
+                    raise
+                # Fall back to local if remote is unavailable
+                res = biomedlm_infer_local(
+                    prompt,
+                    temperature=GEN_TEMPERATURE,
+                    top_p=GEN_TOP_P,
+                    rep_penalty=GEN_REP_PENALTY,
+                    max_new_tokens=GEN_MAX_NEW_TOKENS
+                )
+                if res.startswith("OK::"):
+                    answer = res[4:]
+                    updated = (chat_msgs or []) + [
+                        {"role": "user", "content": user_msg},
+                        {"role": "assistant", "content": answer}
+                    ]
+                    return updated, "", gr.update(value=f"[Remoto→Local] {e_remote}")
+                else:
+                    err_msg = res[5:] if res.startswith("ERR::") else res
+                    raise RuntimeError(f"Remote error: {e_remote} | Local error: {err_msg}")
+
+        # Explicit local mode
         res = biomedlm_infer_local(
             prompt,
             temperature=GEN_TEMPERATURE,
@@ -215,13 +241,11 @@ def biomedlm_reply(user_msg, chat_msgs, ocr_md, ocr_txt):
            return updated, "", gr.update(value="")
        else:
            err_msg = res[5:] if res.startswith("ERR::") else res
-            # fall back to remote if allowed
-            answer2, dbg2 = call_biomedlm_remote(prompt)
            updated = (chat_msgs or []) + [
                {"role": "user", "content": user_msg},
-                {"role": "assistant", "content": answer2}
+                {"role": "assistant", "content": "⚠️ Error LLM (local). Revisa el panel de debug."}
            ]
-            return updated, "", gr.update(value=f"[Local->Remoto fallback]\n{err_msg}\n{dbg2}")
+            return updated, "", gr.update(value=err_msg)
 
    except Exception as e:
        err = f"{e.__class__.__name__}: {str(e) or repr(e)}"
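For reference, here is a minimal standalone sketch of the remote path this commit switches to: an InferenceClient with the timeout set in the constructor, then a plain text_generation call. The prompt, timeout value, and stop sequence are illustrative only; the sketch assumes HF_TOKEN is set and that stanford-crfm/BioMedLM is actually reachable through the HF inference backend.

import os
from huggingface_hub import InferenceClient

client = InferenceClient(
    model="stanford-crfm/BioMedLM",
    token=os.getenv("HF_TOKEN"),
    timeout=120,  # the timeout belongs in the constructor, not in text_generation()
)

# details=False and stream=False make text_generation return a plain str
out = client.text_generation(
    "Question: What is metformin prescribed for?\nAnswer:",  # illustrative prompt
    max_new_tokens=128,
    temperature=0.2,
    top_p=0.9,
    repetition_penalty=1.1,
    stop_sequences=["\nQuestion:"],
    details=False,
    stream=False,
)
print(out.strip())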
 
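The HTTP fallback can be exercised on its own as well; the sketch below issues the same OpenAI-style /v1/completions request that _hf_http_completions builds. Parameter values are illustrative, and whether BioMedLM is served behind these router routes is the commit's assumption rather than something verified here.

import os
import requests

HF_TOKEN = os.getenv("HF_TOKEN")
payload = {
    "model": "stanford-crfm/BioMedLM",
    "prompt": "Question: What is metformin prescribed for?\nAnswer:",  # illustrative
    "max_tokens": 128,
    "temperature": 0.2,
    "top_p": 0.9,
    "stop": ["\nQuestion:"],
}
r = requests.post(
    "https://router.huggingface.co/v1/completions",
    headers={"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {},
    json=payload,
    timeout=120,
)
r.raise_for_status()
data = r.json()
# OpenAI-like completions shape: the generated text is in choices[0]["text"]
print((data["choices"][0].get("text") or "").strip())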