OpScanIA

Sleeping

App Files Files Community

jorgeiv500 committed on Nov 12, 2025

Commit

6bee325

verified ·

1 Parent(s): 930b9fe

Update app.py

Browse files

Files changed (1) hide show

app.py +87 -29

app.py CHANGED Viewed

@@ -1,11 +1,12 @@
-# app.py — DeepSeek-OCR + BioMedLM con ZeroGPU-safe y fallback remoto — Gradio 5
-import os, tempfile, traceback
 import gradio as gr
 import torch
 from PIL import Image
 from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM
 import spaces
 from huggingface_hub import InferenceClient
 # ===============================================================
 # CONFIG (env)
@@ -13,12 +14,16 @@ from huggingface_hub import InferenceClient
 BIO_REMOTE = os.getenv("BIO_REMOTE", "1") == "1"          # Recomendado en Spaces ZeroGPU
 BIO_MODEL_ID = os.getenv("BIO_MODEL_ID", "stanford-crfm/BioMedLM").strip()
 HF_TOKEN = os.getenv("HF_TOKEN")
-BIO_FALLBACK_REMOTE = os.getenv("BIO_FALLBACK_REMOTE", "1") == "1"  # Si falla local => intenta remoto
 GEN_TEMPERATURE = float(os.getenv("GEN_TEMPERATURE", "0.2"))
 GEN_TOP_P = float(os.getenv("GEN_TOP_P", "0.9"))
 GEN_MAX_NEW_TOKENS = int(os.getenv("GEN_MAX_NEW_TOKENS", "512"))
 GEN_REP_PENALTY = float(os.getenv("GEN_REP_PENALTY", "1.1"))
 # Caches (no tocan CUDA en el proceso principal)
 _hf_client = None
@@ -71,10 +76,82 @@ def get_biomedlm():
     global _hf_client
     if BIO_REMOTE:
         if _hf_client is None:
-            _hf_client = InferenceClient(model=BIO_MODEL_ID, token=HF_TOKEN)
         return ("remote", _hf_client)
     return ("local", None)
 @spaces.GPU
 def biomedlm_infer_local(prompt: str,
                          temperature=0.2,
@@ -118,7 +195,6 @@ def biomedlm_infer_local(prompt: str,
         return "OK::" + text.strip()
     except Exception as e:
-        # Devolver mensaje de error rico (no levantar excepción para que ZeroGPU no lo opaque)
         err_cls = e.__class__.__name__
         return f"ERR::[{err_cls}] {str(e) or repr(e)}"
@@ -129,24 +205,16 @@ def biomedlm_reply(user_msg, chat_msgs, ocr_md, ocr_txt):
             user_msg = "Analiza el CONTEXTO_OCR anterior y responde a partir de ese contenido."
         prompt = build_prompt(chat_msgs, ocr_md, ocr_txt, user_msg)
-        mode, handle = get_biomedlm()
-        # Preferido: remoto (evita límites ZeroGPU)
         if mode == "remote":
-            out = handle.text_generation(
-                prompt,
-                max_new_tokens=GEN_MAX_NEW_TOKENS,
-                temperature=GEN_TEMPERATURE,
-                top_p=GEN_TOP_P,
-                repetition_penalty=GEN_REP_PENALTY,
-                stop_sequences=["\nUser:", "### System", "### Context", "### Conversation"]
-            )
-            answer = out.strip() if isinstance(out, str) else str(out)
             updated = (chat_msgs or []) + [
                 {"role": "user", "content": user_msg},
                 {"role": "assistant", "content": answer}
             ]
-            return updated, "", gr.update(value="")
         # Local (ZeroGPU)
         res = biomedlm_infer_local(
@@ -170,22 +238,12 @@ def biomedlm_reply(user_msg, chat_msgs, ocr_md, ocr_txt):
             # Fallback automático a remoto si está permitido
             if BIO_FALLBACK_REMOTE:
-                mode2, handle2 = ("remote", InferenceClient(model=BIO_MODEL_ID, token=HF_TOKEN))
-                out2 = handle2.text_generation(
-                    prompt,
-                    max_new_tokens=GEN_MAX_NEW_TOKENS,
-                    temperature=GEN_TEMPERATURE,
-                    top_p=GEN_TOP_P,
-                    repetition_penalty=GEN_REP_PENALTY,
-                    stop_sequences=["\nUser:", "### System", "### Context", "### Conversation"]
-                )
-                answer2 = out2.strip() if isinstance(out2, str) else str(out2)
                 updated = (chat_msgs or []) + [
                     {"role": "user", "content": user_msg},
                     {"role": "assistant", "content": answer2}
                 ]
-                # Enviar detalle al panel de debug
-                return updated, "", gr.update(value=f"[Local->Remoto fallback]\n{err_msg}")
             else:
                 updated = (chat_msgs or []) + [
                     {"role": "user", "content": user_msg},

+# app.py — DeepSeek-OCR + BioMedLM con fixes para StopIteration (HF) y ZeroGPU — Gradio 5
+import os, tempfile, traceback, json
 import gradio as gr
 import torch
 from PIL import Image
 from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM
 import spaces
 from huggingface_hub import InferenceClient
+import requests  # Fallback HTTP directo a HF si falla InferenceClient
 # ===============================================================
 # CONFIG (env)
 BIO_REMOTE = os.getenv("BIO_REMOTE", "1") == "1"          # Recommended on ZeroGPU Spaces
 BIO_MODEL_ID = os.getenv("BIO_MODEL_ID", "stanford-crfm/BioMedLM").strip()
 HF_TOKEN = os.getenv("HF_TOKEN")
+HF_PROVIDER = os.getenv("HF_PROVIDER", "hf-inference").strip()  # forces the provider and avoids StopIteration
+BIO_FALLBACK_REMOTE = os.getenv("BIO_FALLBACK_REMOTE", "1") == "1"  # If local fails => try remote
 GEN_TEMPERATURE = float(os.getenv("GEN_TEMPERATURE", "0.2"))
 GEN_TOP_P = float(os.getenv("GEN_TOP_P", "0.9"))
 GEN_MAX_NEW_TOKENS = int(os.getenv("GEN_MAX_NEW_TOKENS", "512"))
 GEN_REP_PENALTY = float(os.getenv("GEN_REP_PENALTY", "1.1"))
+GEN_TIMEOUT = int(os.getenv("GEN_TIMEOUT", "60"))  # seconds, for remote calls
+STOP_SEQS = ["\nUser:", "### System", "### Context", "### Conversation"]
 # Caches (they do not touch CUDA in the main process)
 _hf_client = None
     global _hf_client
     if BIO_REMOTE:
         if _hf_client is None:
+            # Fuerza provider para evitar StopIteration en algunas versiones de huggingface_hub
+            _hf_client = InferenceClient(model=BIO_MODEL_ID, token=HF_TOKEN, provider=HF_PROVIDER)
         return ("remote", _hf_client)
     return ("local", None)
+def _hf_text_generation_raw(model_id: str, prompt: str,
+                            temperature: float, top_p: float, rep_penalty: float,
+                            max_new_tokens: int, stop: list, timeout: int) -> str:
+    """
+    Fallback directo a la API de Inference (HTTP) si falla InferenceClient.text_generation
+    Maneja respuestas tanto de serverless como TGI.
+    """
+    url = f"https://api-inference.huggingface.co/models/{model_id}"
+    headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
+    payload = {
+        "inputs": prompt,
+        "parameters": {
+            "max_new_tokens": max_new_tokens,
+            "temperature": temperature,
+            "top_p": top_p,
+            "repetition_penalty": rep_penalty,
+            "stop": stop,
+            "return_full_text": False
+        },
+        "options": {"use_cache": False, "wait_for_model": True}
+    }
+    r = requests.post(url, headers=headers, json=payload, timeout=timeout)
+    if r.status_code == 200:
+        data = r.json()
+        # Respuesta puede ser lista con {generated_text} o dict TGI-like
+        if isinstance(data, list) and len(data) > 0 and "generated_text" in data[0]:
+            return data[0]["generated_text"]
+        # Algunas variantes devuelven dict con 'generated_text' o 'text'
+        if isinstance(data, dict):
+            if "generated_text" in data:
+                return data["generated_text"]
+            if "text" in data:
+                return data["text"]
+        # Fallback a string
+        return json.dumps(data)[:4000]
+    else:
+        raise RuntimeError(f"HTTP {r.status_code}: {r.text[:1000]}")
+def call_biomedlm_remote(prompt: str) -> (str, str):
+    """
+    Intenta usar InferenceClient.text_generation; si levanta StopIteration/otros,
+    cae a HTTP raw. Retorna (respuesta, debug_msg)
+    """
+    client = get_biomedlm()[1]
+    try:
+        out = client.text_generation(
+            prompt,
+            max_new_tokens=GEN_MAX_NEW_TOKENS,
+            temperature=GEN_TEMPERATURE,
+            top_p=GEN_TOP_P,
+            repetition_penalty=GEN_REP_PENALTY,
+            stop_sequences=STOP_SEQS,
+            details=False,  # mantener string plano
+            stream=False,
+            timeout=GEN_TIMEOUT,
+        )
+        answer = out.strip() if isinstance(out, str) else str(out)
+        return answer, ""
+    except Exception as e:
+        # Fallback a HTTP
+        try:
+            answer = _hf_text_generation_raw(
+                BIO_MODEL_ID, prompt,
+                GEN_TEMPERATURE, GEN_TOP_P, GEN_REP_PENALTY,
+                GEN_MAX_NEW_TOKENS, STOP_SEQS, GEN_TIMEOUT
+            ).strip()
+            dbg = f"[Fallback HTTP HF] {e.__class__.__name__}: {str(e) or repr(e)}"
+            return answer, dbg
+        except Exception as e2:
+            raise RuntimeError(f"Remote generation failed: {e.__class__.__name__}: {e} | HTTP fallback: {e2.__class__.__name__}: {e2}")
 @spaces.GPU
 def biomedlm_infer_local(prompt: str,
                          temperature=0.2,
         return "OK::" + text.strip()
     except Exception as e:
         err_cls = e.__class__.__name__
         return f"ERR::[{err_cls}] {str(e) or repr(e)}"
             user_msg = "Analiza el CONTEXTO_OCR anterior y responde a partir de ese contenido."
         prompt = build_prompt(chat_msgs, ocr_md, ocr_txt, user_msg)
+        mode, _handle = get_biomedlm()
+        # Preferido: remoto (evita límites ZeroGPU y CUDA en main)
         if mode == "remote":
+            answer, dbg = call_biomedlm_remote(prompt)
             updated = (chat_msgs or []) + [
                 {"role": "user", "content": user_msg},
                 {"role": "assistant", "content": answer}
             ]
+            return updated, "", gr.update(value=dbg)
         # Local (ZeroGPU)
         res = biomedlm_infer_local(
             # Fallback automático a remoto si está permitido
             if BIO_FALLBACK_REMOTE:
+                answer2, dbg2 = call_biomedlm_remote(prompt)
                 updated = (chat_msgs or []) + [
                     {"role": "user", "content": user_msg},
                     {"role": "assistant", "content": answer2}
                 ]
+                return updated, "", gr.update(value=f"[Local->Remoto fallback]\n{err_msg}\n{dbg2}")
             else:
                 updated = (chat_msgs or []) + [
                     {"role": "user", "content": user_msg},