jorgeiv500 committed
Commit 1cb9d27 · verified · 1 Parent(s): 4a2190b

Update app.py

Files changed (1):
  1. app.py +65 -41
app.py CHANGED
@@ -1,4 +1,4 @@
-# app.py — DeepSeek-OCR + BioMedLM (HF router fix + ZeroGPU-safe) — Gradio 5
+# app.py — DeepSeek-OCR + BioMedLM (remote text_generation + ZeroGPU-safe local) — Gradio 5
 import os, tempfile, traceback, json
 import gradio as gr
 import torch
@@ -14,8 +14,12 @@ import requests
 BIO_REMOTE = os.getenv("BIO_REMOTE", "1") == "1"  # recommended on Spaces ZeroGPU
 BIO_MODEL_ID = os.getenv("BIO_MODEL_ID", "stanford-crfm/BioMedLM").strip()
 HF_TOKEN = os.getenv("HF_TOKEN")
-HF_PROVIDER = os.getenv("HF_PROVIDER", "hf-inference").strip()
 
+# Fallbacks
+BIO_FALLBACK_HTTP = os.getenv("BIO_FALLBACK_HTTP", "1") == "1"    # if InferenceClient fails => HTTP router
+BIO_FALLBACK_LOCAL = os.getenv("BIO_FALLBACK_LOCAL", "1") == "1"  # if everything remote fails => try local GPU
+
+# Generation parameters
 GEN_TEMPERATURE = float(os.getenv("GEN_TEMPERATURE", "0.2"))
 GEN_TOP_P = float(os.getenv("GEN_TOP_P", "0.9"))
 GEN_MAX_NEW_TOKENS = int(os.getenv("GEN_MAX_NEW_TOKENS", "512"))
@@ -65,40 +69,36 @@ def build_prompt(chat_msgs, ocr_md, ocr_txt, user_msg):
     return prompt
 
 # =========================
-# BioMedLM remote/local
+# BioMedLM remote/local (no CUDA in the main process)
 # =========================
 def get_biomedlm():
     """Decide the mode. Do not touch CUDA here."""
     global _hf_client
     if BIO_REMOTE:
         if _hf_client is None:
-            # the timeout goes in the client constructor (not in text_generation)
+            # the timeout goes in the constructor (not in the call)
             _hf_client = InferenceClient(
                 model=BIO_MODEL_ID,
-                provider=HF_PROVIDER,
                 token=HF_TOKEN,
-                timeout=GEN_TIMEOUT,  # ← this is correct
+                timeout=GEN_TIMEOUT,
             )
         return ("remote", _hf_client)
     return ("local", None)
 
-def _hf_http_chat(prompt: str) -> str:
-    """HTTP fallback to the HF router (two possible routes)."""
+def _hf_http_completions(prompt: str) -> str:
+    """HTTP fallback to the HF router (OpenAI-like /v1/completions)."""
     headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
     payload = {
         "model": BIO_MODEL_ID,
-        "messages": [{"role": "user", "content": prompt}],
+        "prompt": prompt,
         "max_tokens": GEN_MAX_NEW_TOKENS,
         "temperature": GEN_TEMPERATURE,
         "top_p": GEN_TOP_P,
         "stop": STOP_SEQS,
     }
-
-    # 1) OpenAI-compatible route
     urls = [
-        "https://router.huggingface.co/v1/chat/completions",
-        # 2) some clients expect the /hf-inference prefix
-        "https://router.huggingface.co/hf-inference/v1/chat/completions",
+        "https://router.huggingface.co/v1/completions",
+        "https://router.huggingface.co/hf-inference/v1/completions",
     ]
     last_exc = None
     for url in urls:
@@ -106,39 +106,43 @@ def _hf_http_chat(prompt: str) -> str:
             r = requests.post(url, headers=headers, json=payload, timeout=GEN_TIMEOUT)
             if r.status_code == 200:
                 data = r.json()
-                # OpenAI-like response
+                # OpenAI completions-like response
                 if isinstance(data, dict) and "choices" in data and data["choices"]:
-                    msg = data["choices"][0].get("message") or {}
-                    return (msg.get("content") or "").strip()
+                    return (data["choices"][0].get("text") or "").strip()
                 return json.dumps(data)[:4000]
-            # if the old API returns 410, keep trying
             last_exc = RuntimeError(f"HTTP {r.status_code}: {r.text[:800]}")
         except Exception as e:
             last_exc = e
-    raise last_exc or RuntimeError("HF router error")
+    raise last_exc or RuntimeError("HF router completions error")
 
 def call_biomedlm_remote(prompt: str) -> (str, str):
     """
-    Uses chat.completions.create (OpenAI-like). If it fails, falls back to the HTTP router.
+    Uses InferenceClient.text_generation (a task BioMedLM supports).
+    If it fails, falls back to the HTTP router /v1/completions.
     Returns (answer, debug_msg)
     """
     client = get_biomedlm()[1]
     try:
-        resp = client.chat.completions.create(
-            model=BIO_MODEL_ID,
-            messages=[{"role": "user", "content": prompt}],
-            max_tokens=GEN_MAX_NEW_TOKENS,
+        out = client.text_generation(
+            prompt=prompt,
+            max_new_tokens=GEN_MAX_NEW_TOKENS,
             temperature=GEN_TEMPERATURE,
             top_p=GEN_TOP_P,
-            stop=STOP_SEQS,
+            repetition_penalty=GEN_REP_PENALTY,
+            stop_sequences=STOP_SEQS,
+            details=False,
+            stream=False,
         )
-        answer = (resp.choices[0].message.content or "").strip()
+        # huggingface_hub returns a str when details=False
+        answer = out.strip() if isinstance(out, str) else str(out)
        return answer, ""
    except Exception as e:
-        # HTTP fallback to the new router
+        if not BIO_FALLBACK_HTTP:
+            raise
+        # HTTP fallback to the new router (completions)
        try:
-            answer = _hf_http_chat(prompt)
-            return answer, f"[Fallback HTTP router] {e.__class__.__name__}: {e}"
+            answer = _hf_http_completions(prompt)
+            return answer, f"[Fallback HTTP router/completions] {e.__class__.__name__}: {e}"
        except Exception as e2:
            raise RuntimeError(
                f"Remote generation failed: {e.__class__.__name__}: {e} | HTTP fallback: {e2.__class__.__name__}: {e2}"
@@ -191,14 +195,36 @@ def biomedlm_reply(user_msg, chat_msgs, ocr_md, ocr_txt):
 
         mode, _ = get_biomedlm()
         if mode == "remote":
-            answer, dbg = call_biomedlm_remote(prompt)
-            updated = (chat_msgs or []) + [
-                {"role": "user", "content": user_msg},
-                {"role": "assistant", "content": answer}
-            ]
-            return updated, "", gr.update(value=dbg)
-
-        # Local (ZeroGPU)
+            try:
+                answer, dbg = call_biomedlm_remote(prompt)
+                updated = (chat_msgs or []) + [
+                    {"role": "user", "content": user_msg},
+                    {"role": "assistant", "content": answer}
+                ]
+                return updated, "", gr.update(value=dbg)
+            except Exception as e_remote:
+                if not BIO_FALLBACK_LOCAL:
+                    raise
+                # Fall back to local if remote is unavailable
+                res = biomedlm_infer_local(
+                    prompt,
+                    temperature=GEN_TEMPERATURE,
+                    top_p=GEN_TOP_P,
+                    rep_penalty=GEN_REP_PENALTY,
+                    max_new_tokens=GEN_MAX_NEW_TOKENS
+                )
+                if res.startswith("OK::"):
+                    answer = res[4:]
+                    updated = (chat_msgs or []) + [
+                        {"role": "user", "content": user_msg},
+                        {"role": "assistant", "content": answer}
+                    ]
+                    return updated, "", gr.update(value=f"[Remoto→Local] {e_remote}")
+                else:
+                    err_msg = res[5:] if res.startswith("ERR::") else res
+                    raise RuntimeError(f"Remote error: {e_remote} | Local error: {err_msg}")
+
+        # Explicit local mode
         res = biomedlm_infer_local(
             prompt,
             temperature=GEN_TEMPERATURE,
@@ -215,13 +241,11 @@ def biomedlm_reply(user_msg, chat_msgs, ocr_md, ocr_txt):
            return updated, "", gr.update(value="")
        else:
            err_msg = res[5:] if res.startswith("ERR::") else res
-            # fall back to remote if allowed
-            answer2, dbg2 = call_biomedlm_remote(prompt)
            updated = (chat_msgs or []) + [
                {"role": "user", "content": user_msg},
-                {"role": "assistant", "content": answer2}
+                {"role": "assistant", "content": "⚠️ Error LLM (local). Revisa el panel de debug."}
            ]
-            return updated, "", gr.update(value=f"[Local->Remoto fallback]\n{err_msg}\n{dbg2}")
+            return updated, "", gr.update(value=err_msg)
 
    except Exception as e:
        err = f"{e.__class__.__name__}: {str(e) or repr(e)}"
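For reference, here is a minimal standalone sketch of the remote path this commit switches to: an InferenceClient with the timeout set in the constructor, then a plain text_generation call. The prompt, timeout value, and stop sequence are illustrative only; the sketch assumes HF_TOKEN is set and that stanford-crfm/BioMedLM is actually reachable through the HF inference backend.

import os
from huggingface_hub import InferenceClient

client = InferenceClient(
    model="stanford-crfm/BioMedLM",
    token=os.getenv("HF_TOKEN"),
    timeout=120,  # the timeout belongs in the constructor, not in text_generation()
)

# details=False and stream=False make text_generation return a plain str
out = client.text_generation(
    "Question: What is metformin prescribed for?\nAnswer:",  # illustrative prompt
    max_new_tokens=128,
    temperature=0.2,
    top_p=0.9,
    repetition_penalty=1.1,
    stop_sequences=["\nQuestion:"],
    details=False,
    stream=False,
)
print(out.strip())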
 
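The HTTP fallback can be exercised on its own as well; the sketch below issues the same OpenAI-style /v1/completions request that _hf_http_completions builds. Parameter values are illustrative, and whether BioMedLM is served behind these router routes is the commit's assumption rather than something verified here.

import os
import requests

HF_TOKEN = os.getenv("HF_TOKEN")
payload = {
    "model": "stanford-crfm/BioMedLM",
    "prompt": "Question: What is metformin prescribed for?\nAnswer:",  # illustrative
    "max_tokens": 128,
    "temperature": 0.2,
    "top_p": 0.9,
    "stop": ["\nQuestion:"],
}
r = requests.post(
    "https://router.huggingface.co/v1/completions",
    headers={"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {},
    json=payload,
    timeout=120,
)
r.raise_for_status()
data = r.json()
# OpenAI-like completions shape: the generated text is in choices[0]["text"]
print((data["choices"][0].get("text") or "").strip())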