Update app.py
app.py
CHANGED
@@ -1,4 +1,4 @@
-# app.py — DeepSeek-OCR + BioMedLM
+# app.py — DeepSeek-OCR + BioMedLM (HF router fix + ZeroGPU-safe) — Gradio 5
 import os, tempfile, traceback, json
 import gradio as gr
 import torch
@@ -6,44 +6,40 @@ from PIL import Image
 from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM
 import spaces
 from huggingface_hub import InferenceClient
+import requests
 
+# =========================
 # CONFIG (env)
+# =========================
+BIO_REMOTE = os.getenv("BIO_REMOTE", "1") == "1"  # recommended on ZeroGPU Spaces
 BIO_MODEL_ID = os.getenv("BIO_MODEL_ID", "stanford-crfm/BioMedLM").strip()
 HF_TOKEN = os.getenv("HF_TOKEN")
+HF_PROVIDER = os.getenv("HF_PROVIDER", "hf-inference").strip()
-BIO_FALLBACK_REMOTE = os.getenv("BIO_FALLBACK_REMOTE", "1") == "1"  # if local fails => try remote
 
 GEN_TEMPERATURE = float(os.getenv("GEN_TEMPERATURE", "0.2"))
 GEN_TOP_P = float(os.getenv("GEN_TOP_P", "0.9"))
 GEN_MAX_NEW_TOKENS = int(os.getenv("GEN_MAX_NEW_TOKENS", "512"))
 GEN_REP_PENALTY = float(os.getenv("GEN_REP_PENALTY", "1.1"))
+GEN_TIMEOUT = int(os.getenv("GEN_TIMEOUT", "60"))  # seconds
 
 STOP_SEQS = ["\nUser:", "### System", "### Context", "### Conversation"]
 
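All of the knobs above are read from environment variables, so a Space can retune generation without code changes; illustrative values, not taken from this commit:

    # e.g. in the Space's "Variables and secrets" settings
    # BIO_REMOTE=1
    # HF_PROVIDER=hf-inference
    # GEN_MAX_NEW_TOKENS=256
    # GEN_TIMEOUT=90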
+# Caches (no CUDA in the main process)
 _hf_client = None
 _bio_local_cache = {"model": None, "tokenizer": None}
 
+# =========================
+# Prompt helpers
+# =========================
-def _truncate(text, max_chars=3000):
-    return (text or "")[:max_chars]
+def _truncate(text, max_chars=3000): return (text or "")[:max_chars]
 
 def _system_prompt():
     return ("Eres un asistente clínico educativo. No sustituyes el juicio médico. "
             "Usa CONTEXTO_OCR si existe; si falta, pídelo. Evita diagnósticos definitivos.")
 
-def _ocr_context(ocr_md, ocr_txt):
-    return _truncate(ocr_md) or _truncate(ocr_txt) or ""
+def _ocr_context(ocr_md, ocr_txt): return _truncate(ocr_md) or _truncate(ocr_txt) or ""
 
 def build_prompt(chat_msgs, ocr_md, ocr_txt, user_msg):
-    """Instruct-style prompt for BioMedLM (no native chat)."""
     sys = _system_prompt()
     ctx = _ocr_context(ocr_md, ocr_txt)
 
@@ -68,89 +64,85 @@ def build_prompt(chat_msgs, ocr_md, ocr_txt, user_msg)
     prompt += f"### Conversation\n{convo}\nAssistant:"
     return prompt
 
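The body of build_prompt falls outside these hunks; judging from the stop sequences and the final line above, the assembled prompt presumably follows a layout along these lines (an assumption, not shown in the diff):

    # hypothetical shape of the string build_prompt returns
    # ### System
    # <_system_prompt() text>
    # ### Context
    # CONTEXTO_OCR: <truncated OCR markdown or plain text>
    # ### Conversation
    # User: <previous turns plus the new user message>
    # Assistant: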
+# =========================
+# BioMedLM remote/local
+# =========================
 def get_biomedlm():
+    """Decide the mode. Do not touch CUDA here."""
     global _hf_client
     if BIO_REMOTE:
         if _hf_client is None:
+            # timeout goes in the client constructor (not in text_generation)
+            _hf_client = InferenceClient(
+                model=BIO_MODEL_ID,
+                provider=HF_PROVIDER,
+                token=HF_TOKEN,
+                timeout=GEN_TIMEOUT,  # ← this is the correct place
+            )
         return ("remote", _hf_client)
     return ("local", None)
 
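The same remote path can be smoke-tested outside Gradio; a minimal sketch, assuming HF_TOKEN is set and the chosen provider actually serves BIO_MODEL_ID:

    client = InferenceClient(model=BIO_MODEL_ID, provider=HF_PROVIDER, token=HF_TOKEN, timeout=GEN_TIMEOUT)
    out = client.chat.completions.create(
        model=BIO_MODEL_ID,
        messages=[{"role": "user", "content": "Hola"}],
        max_tokens=32,
    )
    print(out.choices[0].message.content)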
-    """
-    Direct fallback to the (HTTP) Inference API if InferenceClient.text_generation fails.
-    Handles both serverless and TGI responses.
-    """
-    url = f"https://api-inference.huggingface.co/models/{model_id}"
+def _hf_http_chat(prompt: str) -> str:
+    """HTTP fallback to the HF router (two possible routes)."""
     headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
     payload = {
-            "stop": stop,
-            "return_full_text": False
-        },
-        "options": {"use_cache": False, "wait_for_model": True}
+        "model": BIO_MODEL_ID,
+        "messages": [{"role": "user", "content": prompt}],
+        "max_tokens": GEN_MAX_NEW_TOKENS,
+        "temperature": GEN_TEMPERATURE,
+        "top_p": GEN_TOP_P,
+        "stop": STOP_SEQS,
     }
+
+    # 1) OpenAI-compatible route
+    urls = [
+        "https://router.huggingface.co/v1/chat/completions",
+        # 2) some clients expect the /hf-inference prefix
+        "https://router.huggingface.co/hf-inference/v1/chat/completions",
+    ]
+    last_exc = None
+    for url in urls:
+        try:
+            r = requests.post(url, headers=headers, json=payload, timeout=GEN_TIMEOUT)
+            if r.status_code == 200:
+                data = r.json()
+                # OpenAI-like response
+                if isinstance(data, dict) and "choices" in data and data["choices"]:
+                    msg = data["choices"][0].get("message") or {}
+                    return (msg.get("content") or "").strip()
+                return json.dumps(data)[:4000]
+            # if the old API returns 410, keep trying
+            last_exc = RuntimeError(f"HTTP {r.status_code}: {r.text[:800]}")
+        except Exception as e:
+            last_exc = e
+    raise last_exc or RuntimeError("HF router error")
 
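For reference, the branch above that reads data["choices"][0]["message"]["content"] is parsing an OpenAI-compatible chat completion; an illustrative (not captured) response body:

    # illustrative router response shape, not an actual capture
    # {"choices": [{"message": {"role": "assistant", "content": "..."}, "finish_reason": "stop"}], ...}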
 def call_biomedlm_remote(prompt: str) -> (str, str):
     """
+    Uses chat.completions.create (OpenAI-like). If it fails, falls back to the HTTP router.
+    Returns (answer, debug_msg).
     """
     client = get_biomedlm()[1]
     try:
+        resp = client.chat.completions.create(
+            model=BIO_MODEL_ID,
+            messages=[{"role": "user", "content": prompt}],
+            max_tokens=GEN_MAX_NEW_TOKENS,
             temperature=GEN_TEMPERATURE,
             top_p=GEN_TOP_P,
-            stop_sequences=STOP_SEQS,
-            details=False,  # keep a plain string
-            stream=False,
-            timeout=GEN_TIMEOUT,
+            stop=STOP_SEQS,
         )
+        answer = (resp.choices[0].message.content or "").strip()
         return answer, ""
     except Exception as e:
+        # HTTP fallback to the new router
         try:
-                GEN_TEMPERATURE, GEN_TOP_P, GEN_REP_PENALTY,
-                GEN_MAX_NEW_TOKENS, STOP_SEQS, GEN_TIMEOUT
-            ).strip()
-            dbg = f"[Fallback HTTP HF] {e.__class__.__name__}: {str(e) or repr(e)}"
-            return answer, dbg
+            answer = _hf_http_chat(prompt)
+            return answer, f"[Fallback HTTP router] {e.__class__.__name__}: {e}"
         except Exception as e2:
+            raise RuntimeError(
+                f"Remote generation failed: {e.__class__.__name__}: {e} | HTTP fallback: {e2.__class__.__name__}: {e2}"
+            )
 
 @spaces.GPU
 def biomedlm_infer_local(prompt: str,
@@ -158,26 +150,21 @@ def biomedlm_infer_local(prompt: str,
                          top_p=0.9,
                          rep_penalty=1.1,
                          max_new_tokens=512) -> str:
+    """Local execution in the GPU worker; returns OK:: or ERR::..."""
     try:
-        # Lazy load inside the GPU worker
         if _bio_local_cache["model"] is None:
             tok = AutoTokenizer.from_pretrained(BIO_MODEL_ID, use_fast=True)
-            dtype = torch.float32
+            dtype = torch.bfloat16 if (torch.cuda.is_available() and torch.cuda.is_bf16_supported()) else (
+                torch.float16 if torch.cuda.is_available() else torch.float32
+            )
             model = AutoModelForCausalLM.from_pretrained(BIO_MODEL_ID, torch_dtype=dtype)
             if torch.cuda.is_available():
                 model = model.to("cuda")
             _bio_local_cache["model"] = model.eval()
             _bio_local_cache["tokenizer"] = tok
 
         model = _bio_local_cache["model"]
         tok = _bio_local_cache["tokenizer"]
         inputs = tok(prompt, return_tensors="pt")
         if torch.cuda.is_available():
             inputs = {k: v.to("cuda") for k, v in inputs.items()}
@@ -193,21 +180,16 @@ def biomedlm_infer_local(prompt: str,
         )
         text = tok.decode(gen_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
         return "OK::" + text.strip()
     except Exception as e:
-        return f"ERR::[{err_cls}] {str(e) or repr(e)}"
+        return f"ERR::[{e.__class__.__name__}] {str(e) or repr(e)}"
 
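The model.generate call that produces gen_ids (new lines 171-179) sits outside these hunks; a sketch of a typical invocation consistent with the names shown above, assuming standard transformers generation kwargs (a temperature argument, if any, would be on the elided signature line):

    with torch.no_grad():
        gen_ids = model.generate(
            **inputs,
            do_sample=True,
            top_p=top_p,
            repetition_penalty=rep_penalty,
            max_new_tokens=max_new_tokens,
            pad_token_id=tok.eos_token_id,
        )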
 def biomedlm_reply(user_msg, chat_msgs, ocr_md, ocr_txt):
-    """Wrapper that decides remote/local and handles fallback + explicit error messages."""
     try:
         if not user_msg:
             user_msg = "Analiza el CONTEXTO_OCR anterior y responde a partir de ese contenido."
         prompt = build_prompt(chat_msgs, ocr_md, ocr_txt, user_msg)
 
-        # Preferred: remote (avoids ZeroGPU limits and CUDA in main)
+        mode, _ = get_biomedlm()
         if mode == "remote":
             answer, dbg = call_biomedlm_remote(prompt)
             updated = (chat_msgs or []) + [
@@ -224,7 +206,6 @@ def biomedlm_reply(user_msg, chat_msgs, ocr_md, ocr_txt):
                 rep_penalty=GEN_REP_PENALTY,
                 max_new_tokens=GEN_MAX_NEW_TOKENS
             )
-
             if res.startswith("OK::"):
                 answer = res[4:]
                 updated = (chat_msgs or []) + [
@@ -233,23 +214,14 @@ def biomedlm_reply(user_msg, chat_msgs, ocr_md, ocr_txt):
                 ]
                 return updated, "", gr.update(value="")
             else:
-                # Local error: the detailed message comes in res
                 err_msg = res[5:] if res.startswith("ERR::") else res
-            else:
-                updated = (chat_msgs or []) + [
-                    {"role": "user", "content": user_msg},
-                    {"role": "assistant", "content": "⚠️ Error LLM (local). Revisa el panel de debug."}
-                ]
-                return updated, "", gr.update(value=err_msg)
+                # fall back to remote if allowed
+                answer2, dbg2 = call_biomedlm_remote(prompt)
+                updated = (chat_msgs or []) + [
+                    {"role": "user", "content": user_msg},
+                    {"role": "assistant", "content": answer2}
+                ]
+                return updated, "", gr.update(value=f"[Local->Remoto fallback]\n{err_msg}\n{dbg2}")
 
     except Exception as e:
         err = f"{e.__class__.__name__}: {str(e) or repr(e)}"
@@ -260,32 +232,24 @@ def biomedlm_reply(user_msg, chat_msgs, ocr_md, ocr_txt):
         ]
         return updated, "", gr.update(value=f"{err}\n{tb}")
 
-def clear_chat():
-    return [], "", gr.update(value="")
+def clear_chat(): return [], "", gr.update(value="")
 
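Throughout biomedlm_reply the chat history is built as a plain list of role/content dictionaries; a minimal sketch of the structure handed back to the Chatbot, assuming the component is created with type="messages" (its constructor is outside these hunks):

    updated = [
        {"role": "user", "content": "..."},
        {"role": "assistant", "content": "..."},
    ]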
+# =========================
+# DeepSeek-OCR (no CUDA in main)
+# =========================
 def _load_ocr_model():
     model_name = "deepseek-ai/DeepSeek-OCR"
     ocr_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
     attn_impl = os.getenv("OCR_ATTN_IMPL", "flash_attention_2")
     try:
         ocr_model = AutoModel.from_pretrained(
-            model_name,
-            _attn_implementation=attn_impl,
-            trust_remote_code=True,
-            use_safetensors=True
+            model_name, _attn_implementation=attn_impl, trust_remote_code=True, use_safetensors=True
         ).eval()
         return ocr_tokenizer, ocr_model
     except Exception as e:
         if any(k in str(e).lower() for k in ["flash_attn", "flashattention2", "flash_attention_2"]):
             ocr_model = AutoModel.from_pretrained(
-                model_name,
-                _attn_implementation="eager",
-                trust_remote_code=True,
-                use_safetensors=True
+                model_name, _attn_implementation="eager", trust_remote_code=True, use_safetensors=True
             ).eval()
             return ocr_tokenizer, ocr_model
         raise
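On Spaces where flash-attn is not installed, the eager path can also be selected up front, instead of relying on the exception fallback, via the environment variable read above; a one-line sketch:

    os.environ["OCR_ATTN_IMPL"] = "eager"  # set before _load_ocr_model() runs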
@@ -297,7 +261,6 @@ def process_image(image, model_size, task_type, is_eval_mode):
     if image is None:
         return None, "Please upload an image first.", "Please upload an image first."
 
-    # Move to GPU ONLY inside the worker
     if torch.cuda.is_available():
         dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
         model_device = model.to(dtype).to("cuda")
@@ -347,9 +310,9 @@ def process_image(image, model_size, task_type, is_eval_mode):
     text_result = plain_text_result if plain_text_result else markdown_content
     return result_image, markdown_content, text_result
 
+# =========================
 # UI (Gradio 5)
+# =========================
 with gr.Blocks(title="DeepSeek-OCR + BioMedLM", theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
@@ -366,29 +329,18 @@ with gr.Blocks(title="DeepSeek-OCR + BioMedLM", theme=gr.themes.Soft()) as demo:
     with gr.Row():
         with gr.Column(scale=1):
             image_input = gr.Image(type="pil", label="Upload Image", sources=["upload", "clipboard", "webcam"])
+            model_size = gr.Dropdown(choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
+                                     value="Gundam (Recommended)", label="Model Size")
+            task_type = gr.Dropdown(choices=["Free OCR", "Convert to Markdown"],
+                                    value="Convert to Markdown", label="Task Type")
+            eval_mode_checkbox = gr.Checkbox(value=False, label="Enable Evaluation Mode",
+                                             info="Solo texto (más rápido). Desmárcalo para ver imagen anotada y markdown.")
             submit_btn = gr.Button("Process Image", variant="primary")
 
         with gr.Column(scale=2):
             with gr.Tabs():
-                with gr.TabItem("Annotated Image"):
-                with gr.TabItem("Markdown Preview"):
-                    output_markdown = gr.Markdown()
+                with gr.TabItem("Annotated Image"): output_image = gr.Image(interactive=False)
+                with gr.TabItem("Markdown Preview"): output_markdown = gr.Markdown()
                 with gr.TabItem("Markdown Source (or Eval Output)"):
                     output_text = gr.Textbox(lines=18, show_copy_button=True, interactive=False)
     with gr.Row():
@@ -416,11 +368,8 @@
         outputs=[ocr_md_state, ocr_txt_state, md_preview, txt_preview],
     )
 
-    send_btn.click(
-        inputs=[user_in, chatbot, ocr_md_state, ocr_txt_state],
-        outputs=[chatbot, user_in, error_box]
-    )
+    send_btn.click(fn=biomedlm_reply, inputs=[user_in, chatbot, ocr_md_state, ocr_txt_state],
+                   outputs=[chatbot, user_in, error_box])
     clear_btn.click(fn=clear_chat, outputs=[chatbot, user_in, error_box])
 
 if __name__ == "__main__":
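The body under if __name__ == "__main__": is cut off in this view; the usual closing lines for a Gradio 5 Space, stated here as an assumption rather than the commit's actual content, would be along these lines:

    if __name__ == "__main__":
        demo.launch()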