jorgeiv500 committed · Commit 7cb8c04 · verified · Parent(s): 1cb9d27

Update app.py

Files changed (1):
  1. app.py +144 -268
app.py CHANGED
@@ -1,9 +1,9 @@
- # app.py — DeepSeek-OCR + BioMedLM (remote text_generation + ZeroGPU-safe local) — Gradio 5
- import os, tempfile, traceback, json
  import gradio as gr
  import torch
  from PIL import Image
- from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM
  import spaces
  from huggingface_hub import InferenceClient
  import requests
@@ -11,119 +11,95 @@ import requests
  # =========================
  # CONFIG (env)
  # =========================
- BIO_REMOTE = os.getenv("BIO_REMOTE", "1") == "1"  # recommended on Spaces ZeroGPU
- BIO_MODEL_ID = os.getenv("BIO_MODEL_ID", "stanford-crfm/BioMedLM").strip()
  HF_TOKEN = os.getenv("HF_TOKEN")

- # Fallbacks
- BIO_FALLBACK_HTTP = os.getenv("BIO_FALLBACK_HTTP", "1") == "1"    # if InferenceClient fails => HTTP router
- BIO_FALLBACK_LOCAL = os.getenv("BIO_FALLBACK_LOCAL", "1") == "1"  # if every remote path fails => try local GPU
-
- # Generation parameters
- GEN_TEMPERATURE = float(os.getenv("GEN_TEMPERATURE", "0.2"))
- GEN_TOP_P = float(os.getenv("GEN_TOP_P", "0.9"))
- GEN_MAX_NEW_TOKENS = int(os.getenv("GEN_MAX_NEW_TOKENS", "512"))
- GEN_REP_PENALTY = float(os.getenv("GEN_REP_PENALTY", "1.1"))
  GEN_TIMEOUT = int(os.getenv("GEN_TIMEOUT", "60"))  # s

- STOP_SEQS = ["\nUser:", "### System", "### Context", "### Conversation"]
-
- # Caches (do not touch CUDA in the main process)
- _hf_client = None
- _bio_local_cache = {"model": None, "tokenizer": None}

  # =========================
  # Prompt helpers
  # =========================
- def _truncate(text, max_chars=3000): return (text or "")[:max_chars]
-
- def _system_prompt():
-     return ("Eres un asistente clínico educativo. No sustituyes el juicio médico. "
-             "Usa CONTEXTO_OCR si existe; si falta, pídelo. Evita diagnósticos definitivos.")
-
- def _ocr_context(ocr_md, ocr_txt): return _truncate(ocr_md) or _truncate(ocr_txt) or ""

  def build_prompt(chat_msgs, ocr_md, ocr_txt, user_msg):
-     sys = _system_prompt()
-     ctx = _ocr_context(ocr_md, ocr_txt)

-     history_lines = []
      for m in (chat_msgs or []):
-         role = m.get("role")
-         content = (m.get("content") or "").strip()
-         if not content:
-             continue
-         if role == "user":
-             history_lines.append(f"User: {content}")
-         elif role == "assistant":
-             history_lines.append(f"Assistant: {content}")
-
-     if user_msg:
-         history_lines.append(f"User: {user_msg}")
-
-     convo = "\n".join(history_lines).strip()
-     prompt = f"### System\n{sys}\n\n"
-     if ctx:
-         prompt += f"### Context (OCR)\n{ctx}\n\n"
-     prompt += f"### Conversation\n{convo}\nAssistant:"
      return prompt

  # =========================
- # BioMedLM remote/local (NO CUDA in main)
  # =========================
- def get_biomedlm():
-     """Decide the mode. Do not touch CUDA here."""
-     global _hf_client
-     if BIO_REMOTE:
-         if _hf_client is None:
-             # timeout goes in the constructor (not in the call)
-             _hf_client = InferenceClient(
-                 model=BIO_MODEL_ID,
-                 token=HF_TOKEN,
-                 timeout=GEN_TIMEOUT,
-             )
-         return ("remote", _hf_client)
-     return ("local", None)
-
- def _hf_http_completions(prompt: str) -> str:
-     """HTTP fallback to the HF router (OpenAI-like /v1/completions)."""
-     headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
-     payload = {
-         "model": BIO_MODEL_ID,
-         "prompt": prompt,
-         "max_tokens": GEN_MAX_NEW_TOKENS,
-         "temperature": GEN_TEMPERATURE,
-         "top_p": GEN_TOP_P,
-         "stop": STOP_SEQS,
-     }
-     urls = [
-         "https://router.huggingface.co/v1/completions",
-         "https://router.huggingface.co/hf-inference/v1/completions",
-     ]
-     last_exc = None
-     for url in urls:
-         try:
-             r = requests.post(url, headers=headers, json=payload, timeout=GEN_TIMEOUT)
-             if r.status_code == 200:
-                 data = r.json()
-                 # OpenAI completions-like
-                 if isinstance(data, dict) and "choices" in data and data["choices"]:
-                     return (data["choices"][0].get("text") or "").strip()
-                 return json.dumps(data)[:4000]
-             last_exc = RuntimeError(f"HTTP {r.status_code}: {r.text[:800]}")
-         except Exception as e:
-             last_exc = e
-     raise last_exc or RuntimeError("HF router completions error")
-
- def call_biomedlm_remote(prompt: str) -> (str, str):
      """
-     Uses InferenceClient.text_generation (a task BioMedLM supports).
-     If that fails, falls back to the HTTP router /v1/completions.
-     Returns (answer, debug_msg).
      """
-     client = get_biomedlm()[1]
      try:
-         out = client.text_generation(
              prompt=prompt,
              max_new_tokens=GEN_MAX_NEW_TOKENS,
              temperature=GEN_TEMPERATURE,
@@ -131,151 +107,76 @@ def call_biomedlm_remote(prompt: str) -> (str, str):
              repetition_penalty=GEN_REP_PENALTY,
              stop_sequences=STOP_SEQS,
              details=False,
              stream=False,
          )
-         # huggingface_hub returns a str when details=False
-         answer = out.strip() if isinstance(out, str) else str(out)
-         return answer, ""
-     except Exception as e:
-         if not BIO_FALLBACK_HTTP:
-             raise
-         # HTTP fallback to the new router (completions)
          try:
-             answer = _hf_http_completions(prompt)
-             return answer, f"[Fallback HTTP router/completions] {e.__class__.__name__}: {e}"
          except Exception as e2:
-             raise RuntimeError(
-                 f"Remote generation failed: {e.__class__.__name__}: {e} | HTTP fallback: {e2.__class__.__name__}: {e2}"
-             )
-
- @spaces.GPU
- def biomedlm_infer_local(prompt: str,
-                          temperature=0.2,
-                          top_p=0.9,
-                          rep_penalty=1.1,
-                          max_new_tokens=512) -> str:
-     """Local run on the GPU worker; returns OK:: or ERR::..."""
-     try:
-         if _bio_local_cache["model"] is None:
-             tok = AutoTokenizer.from_pretrained(BIO_MODEL_ID, use_fast=True)
-             dtype = torch.bfloat16 if (torch.cuda.is_available() and torch.cuda.is_bf16_supported()) else (
-                 torch.float16 if torch.cuda.is_available() else torch.float32
-             )
-             model = AutoModelForCausalLM.from_pretrained(BIO_MODEL_ID, torch_dtype=dtype)
-             if torch.cuda.is_available():
-                 model = model.to("cuda")
-             _bio_local_cache["model"] = model.eval()
-             _bio_local_cache["tokenizer"] = tok
-
-         model = _bio_local_cache["model"]
-         tok = _bio_local_cache["tokenizer"]
-         inputs = tok(prompt, return_tensors="pt")
-         if torch.cuda.is_available():
-             inputs = {k: v.to("cuda") for k, v in inputs.items()}
-
-         gen_ids = model.generate(
-             **inputs,
-             do_sample=True,
-             temperature=temperature,
-             top_p=top_p,
-             repetition_penalty=rep_penalty,
-             max_new_tokens=max_new_tokens,
-             eos_token_id=tok.eos_token_id,
-         )
-         text = tok.decode(gen_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
-         return "OK::" + text.strip()
-     except Exception as e:
-         return f"ERR::[{e.__class__.__name__}] {str(e) or repr(e)}"

- def biomedlm_reply(user_msg, chat_msgs, ocr_md, ocr_txt):
      try:
-         if not user_msg:
-             user_msg = "Analiza el CONTEXTO_OCR anterior y responde a partir de ese contenido."
          prompt = build_prompt(chat_msgs, ocr_md, ocr_txt, user_msg)
-
-         mode, _ = get_biomedlm()
-         if mode == "remote":
-             try:
-                 answer, dbg = call_biomedlm_remote(prompt)
-                 updated = (chat_msgs or []) + [
-                     {"role": "user", "content": user_msg},
-                     {"role": "assistant", "content": answer}
-                 ]
-                 return updated, "", gr.update(value=dbg)
-             except Exception as e_remote:
-                 if not BIO_FALLBACK_LOCAL:
-                     raise
-                 # Fall back to local if remote is unavailable
-                 res = biomedlm_infer_local(
-                     prompt,
-                     temperature=GEN_TEMPERATURE,
-                     top_p=GEN_TOP_P,
-                     rep_penalty=GEN_REP_PENALTY,
-                     max_new_tokens=GEN_MAX_NEW_TOKENS
-                 )
-                 if res.startswith("OK::"):
-                     answer = res[4:]
-                     updated = (chat_msgs or []) + [
-                         {"role": "user", "content": user_msg},
-                         {"role": "assistant", "content": answer}
-                     ]
-                     return updated, "", gr.update(value=f"[Remoto→Local] {e_remote}")
-                 else:
-                     err_msg = res[5:] if res.startswith("ERR::") else res
-                     raise RuntimeError(f"Remote error: {e_remote} | Local error: {err_msg}")
-
-         # Explicit local mode
-         res = biomedlm_infer_local(
-             prompt,
-             temperature=GEN_TEMPERATURE,
-             top_p=GEN_TOP_P,
-             rep_penalty=GEN_REP_PENALTY,
-             max_new_tokens=GEN_MAX_NEW_TOKENS
-         )
-         if res.startswith("OK::"):
-             answer = res[4:]
-             updated = (chat_msgs or []) + [
-                 {"role": "user", "content": user_msg},
-                 {"role": "assistant", "content": answer}
-             ]
-             return updated, "", gr.update(value="")
-         else:
-             err_msg = res[5:] if res.startswith("ERR::") else res
-             updated = (chat_msgs or []) + [
-                 {"role": "user", "content": user_msg},
-                 {"role": "assistant", "content": "⚠️ Error LLM (local). Revisa el panel de debug."}
-             ]
-             return updated, "", gr.update(value=err_msg)
-
      except Exception as e:
-         err = f"{e.__class__.__name__}: {str(e) or repr(e)}"
          tb = traceback.format_exc(limit=2)
          updated = (chat_msgs or []) + [
              {"role": "user", "content": user_msg or ""},
-             {"role": "assistant", "content": f"⚠️ Error LLM: {err}"}
          ]
-         return updated, "", gr.update(value=f"{err}\n{tb}")

  def clear_chat(): return [], "", gr.update(value="")

  # =========================
- # DeepSeek-OCR (no CUDA in main)
  # =========================
  def _load_ocr_model():
      model_name = "deepseek-ai/DeepSeek-OCR"
-     ocr_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
      attn_impl = os.getenv("OCR_ATTN_IMPL", "flash_attention_2")
      try:
-         ocr_model = AutoModel.from_pretrained(
-             model_name, _attn_implementation=attn_impl, trust_remote_code=True, use_safetensors=True
          ).eval()
-         return ocr_tokenizer, ocr_model
      except Exception as e:
          if any(k in str(e).lower() for k in ["flash_attn", "flashattention2", "flash_attention_2"]):
-             ocr_model = AutoModel.from_pretrained(
-                 model_name, _attn_implementation="eager", trust_remote_code=True, use_safetensors=True
              ).eval()
-             return ocr_tokenizer, ocr_model
          raise

  tokenizer, model = _load_ocr_model()
@@ -285,6 +186,7 @@ def process_image(image, model_size, task_type, is_eval_mode):
      if image is None:
          return None, "Please upload an image first.", "Please upload an image first."

      if torch.cuda.is_available():
          dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
          model_device = model.to(dtype).to("cuda")
@@ -297,23 +199,23 @@ def process_image(image, model_size, task_type, is_eval_mode):
      temp_image_path = os.path.join(output_path, "temp_image.jpg")
      image.save(temp_image_path)

-     size_configs = {
-         "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
-         "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
-         "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
-         "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
-         "Gundam (Recommended)": {"base_size": 1024, "image_size": 640, "crop_mode": True},
      }
-     config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])

-     plain_text_result = model_device.infer(
          tokenizer,
          prompt=prompt,
          image_file=temp_image_path,
          output_path=output_path,
-         base_size=config["base_size"],
-         image_size=config["image_size"],
-         crop_mode=config["crop_mode"],
          save_results=True,
          test_compress=True,
          eval_mode=is_eval_mode,
@@ -331,18 +233,18 @@ def process_image(image, model_size, task_type, is_eval_mode):
      if os.path.exists(image_result_path):
          result_image = Image.open(image_result_path); result_image.load()

-     text_result = plain_text_result if plain_text_result else markdown_content
      return result_image, markdown_content, text_result

  # =========================
  # UI (Gradio 5)
  # =========================
- with gr.Blocks(title="DeepSeek-OCR + BioMedLM", theme=gr.themes.Soft()) as demo:
      gr.Markdown(
          """
- # DeepSeek-OCR → Chat Médico con **BioMedLM**
  1) **Sube una imagen** y corre **OCR** (imagen anotada, Markdown y texto).
- 2) **Chatea** con **BioMedLM** usando automáticamente el **OCR** como contexto.
  *Uso educativo; no reemplaza consejo médico.*
          """
      )
@@ -353,10 +255,14 @@ with gr.Blocks(title="DeepSeek-OCR + BioMedLM", theme=gr.themes.Soft()) as demo:
      with gr.Row():
          with gr.Column(scale=1):
              image_input = gr.Image(type="pil", label="Upload Image", sources=["upload", "clipboard", "webcam"])
-             model_size = gr.Dropdown(choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
-                                      value="Gundam (Recommended)", label="Model Size")
-             task_type = gr.Dropdown(choices=["Free OCR", "Convert to Markdown"],
-                                     value="Convert to Markdown", label="Task Type")
              eval_mode_checkbox = gr.Checkbox(value=False, label="Enable Evaluation Mode",
                                               info="Solo texto (más rápido). Desmárcalo para ver imagen anotada y markdown.")
              submit_btn = gr.Button("Process Image", variant="primary")
@@ -365,37 +271,7 @@ with gr.Blocks(title="DeepSeek-OCR + BioMedLM", theme=gr.themes.Soft()) as demo:
      with gr.Tabs():
          with gr.TabItem("Annotated Image"): output_image = gr.Image(interactive=False)
          with gr.TabItem("Markdown Preview"): output_markdown = gr.Markdown()
-         with gr.TabItem("Markdown Source (or Eval Output)"):
-             output_text = gr.Textbox(lines=18, show_copy_button=True, interactive=False)
      with gr.Row():
          md_preview = gr.Textbox(label="Snapshot Markdown OCR", lines=10, interactive=False)
-         txt_preview = gr.Textbox(label="Snapshot Texto OCR", lines=10, interactive=False)
-
-     gr.Markdown("## Chat Clínico (BioMedLM)")
-     with gr.Row():
-         with gr.Column(scale=2):
-             chatbot = gr.Chatbot(label="Asistente OCR (BioMedLM)", type="messages", height=420)
-             user_in = gr.Textbox(label="Mensaje", placeholder="Escribe tu consulta… (vacío = analiza solo el OCR)", lines=2)
-             with gr.Row():
-                 send_btn = gr.Button("Enviar", variant="primary")
-                 clear_btn = gr.Button("Limpiar")
-         with gr.Column(scale=1):
-             error_box = gr.Textbox(label="Debug (si hay error)", lines=8, interactive=False)
-
-     submit_btn.click(
-         fn=process_image,
-         inputs=[image_input, model_size, task_type, eval_mode_checkbox],
-         outputs=[output_image, output_markdown, output_text],
-     ).then(
-         fn=lambda md, tx: (md, tx, md, tx),
-         inputs=[output_markdown, output_text],
-         outputs=[ocr_md_state, ocr_txt_state, md_preview, txt_preview],
-     )
-
-     send_btn.click(fn=biomedlm_reply, inputs=[user_in, chatbot, ocr_md_state, ocr_txt_state],
-                    outputs=[chatbot, user_in, error_box])
-     clear_btn.click(fn=clear_chat, outputs=[chatbot, user_in, error_box])
-
- if __name__ == "__main__":
-     demo.queue(max_size=20)
-     demo.launch()
 
+ # app.py — DeepSeek-OCR + Med42 Instruct (remote, ZeroGPU-safe) — Gradio 5
+ import os, re, json, tempfile, traceback
  import gradio as gr
  import torch
  from PIL import Image
+ from transformers import AutoModel, AutoTokenizer
  import spaces
  from huggingface_hub import InferenceClient
  import requests
 
  # =========================
  # CONFIG (env)
  # =========================
+ LLM_MODEL_ID = os.getenv("BIO_MODEL_ID", "m42-health/Llama3-Med42-8B-Instruct").strip()
  HF_TOKEN = os.getenv("HF_TOKEN")

+ # Generation (deterministic, for better instruction-following)
+ GEN_TEMPERATURE = float(os.getenv("GEN_TEMPERATURE", "0.0"))
+ GEN_TOP_P = float(os.getenv("GEN_TOP_P", "1.0"))
+ GEN_MAX_NEW_TOKENS = int(os.getenv("GEN_MAX_NEW_TOKENS", "384"))
+ GEN_REP_PENALTY = float(os.getenv("GEN_REP_PENALTY", "1.0"))
  GEN_TIMEOUT = int(os.getenv("GEN_TIMEOUT", "60"))  # s
+ STOP_SEQS = ["\n###", "\nUser:", "\nAssistant:"]

+ # Remote (HTTP) client; does not touch CUDA
+ _hf_client = InferenceClient(model=LLM_MODEL_ID, token=HF_TOKEN, timeout=GEN_TIMEOUT)

  # =========================
  # Prompt helpers
  # =========================
+ def _truncate(s: str, n=3000): return (s or "")[:n]
+
+ def _clean_ocr(s: str) -> str:
+     if not s: return ""
+     s = re.sub(r'[^\S\r\n]+', ' ', s)            # collapse runs of whitespace (keep newlines)
+     s = re.sub(r'(\{#Sec\d+\}|#+\w*)', ' ', s)   # strip stray anchors/odd headers
+     s = re.sub(r'\s{2,}', ' ', s)
+     lines = []
+     for par in s.splitlines():
+         par = par.strip()
+         if 0 < len(par) <= 600:
+             lines.append(par)
+     return "\n".join(lines)
+
+ FEWSHOT = """
+ ### INSTRUCCIÓN
+ Eres un **analista clínico educativo**. Responde **SIEMPRE en español**.
+ Reglas: (1) Usa ÚNICAMENTE el CONTEXTO_OCR; (2) Si falta un dato, escribe literalmente: "dato no disponible en el OCR";
+ (3) No inventes nada; (4) Responde en viñetas claras; (5) Cita fragmentos exactos del OCR entre comillas como evidencia.
+
+ ### EJEMPLO 1
+ CONTEXTO_OCR:
+ Paciente: Juan Pérez. Medicamento: Amoxicilina 500 mg cada 8 horas por 7 días.
+ PREGUNTA:
+ ¿Cuál es el medicamento y la dosis?
+ SALIDA_ES:
+ - Medicamento: **Amoxicilina**
+ - Dosis: **500 mg cada 8 horas por 7 días**
+ - Evidencia OCR: "Amoxicilina 500 mg cada 8 horas por 7 días"
+
+ ### EJEMPLO 2
+ CONTEXTO_OCR:
+ Paciente: —. Indicaciones ilegibles.
+ PREGUNTA:
+ ¿Hay contraindicaciones registradas?
+ SALIDA_ES:
+ - Contraindicaciones: **dato no disponible en el OCR**
+ - Evidencia OCR: "Indicaciones ilegibles"
+ """.strip()

  def build_prompt(chat_msgs, ocr_md, ocr_txt, user_msg):
+     raw = ocr_md if (ocr_md and ocr_md.strip()) else ocr_txt
+     ctx = _truncate(_clean_ocr(raw), 3000)

+     history = []
      for m in (chat_msgs or []):
+         role, content = m.get("role"), (m.get("content") or "").strip()
+         if not content: continue
+         history.append(f"- { 'Usuario' if role=='user' else 'Asistente' }: {content}")
+     hist_block = "\n".join(history) if history else "—"
+
+     question = (user_msg or "Analiza el CONTEXTO_OCR y resume lo clínicamente relevante en viñetas.").strip()
+
+     prompt = (
+         FEWSHOT + "\n\n"
+         "### CONTEXTO_OCR\n" + (ctx if ctx else "—") + "\n\n"
+         "### HISTORIAL (si existe)\n" + hist_block + "\n\n"
+         "### PREGUNTA\n" + question + "\n\n"
+         "### SALIDA_ES\n"
+     )
      return prompt

  # =========================
+ # Remote LLM (Med42 Instruct) via text_generation
  # =========================
+ def med42_remote_generate(prompt: str) -> (str, str):
      """
+     Tries InferenceClient.text_generation (serverless/TGI). If it fails,
+     falls back to the OpenAI-like router /v1/completions.
      """
      try:
+         out = _hf_client.text_generation(
              prompt=prompt,
              max_new_tokens=GEN_MAX_NEW_TOKENS,
              temperature=GEN_TEMPERATURE,

              repetition_penalty=GEN_REP_PENALTY,
              stop_sequences=STOP_SEQS,
              details=False,
+             do_sample=False,  # deterministic
              stream=False,
          )
+         return (out.strip() if isinstance(out, str) else str(out)), ""
+     except Exception as e1:
+         # HTTP fallback to the router
          try:
+             headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
+             payload = {
+                 "model": LLM_MODEL_ID,
+                 "prompt": prompt,
+                 "max_tokens": GEN_MAX_NEW_TOKENS,
+                 "temperature": GEN_TEMPERATURE,
+                 "top_p": GEN_TOP_P,
+                 "stop": STOP_SEQS,
+             }
+             for url in ["https://router.huggingface.co/v1/completions",
+                         "https://router.huggingface.co/hf-inference/v1/completions"]:
+                 r = requests.post(url, headers=headers, json=payload, timeout=GEN_TIMEOUT)
+                 if r.status_code == 200:
+                     data = r.json()
+                     if isinstance(data, dict) and "choices" in data and data["choices"]:
+                         return (data["choices"][0].get("text") or "").strip(), f"[Fallback router: {url}] {e1}"
+                 raise RuntimeError(f"HTTP {r.status_code}: {r.text[:800]}")
          except Exception as e2:
+             raise RuntimeError(f"Remote generation failed: {e1.__class__.__name__}: {e1} | HTTP fallback: {e2.__class__.__name__}: {e2}")

+ def med42_reply(user_msg, chat_msgs, ocr_md, ocr_txt):
      try:
          prompt = build_prompt(chat_msgs, ocr_md, ocr_txt, user_msg)
+         answer, dbg = med42_remote_generate(prompt)
+         updated = (chat_msgs or []) + [
+             {"role": "user", "content": user_msg or "(analizar solo OCR)"},
+             {"role": "assistant", "content": answer}
+         ]
+         return updated, "", gr.update(value=dbg)
      except Exception as e:
          tb = traceback.format_exc(limit=2)
          updated = (chat_msgs or []) + [
              {"role": "user", "content": user_msg or ""},
+             {"role": "assistant", "content": f"⚠️ Error LLM: {e}"}
          ]
+         return updated, "", gr.update(value=f"{e}\n{tb}")

  def clear_chat(): return [], "", gr.update(value="")

  # =========================
+ # DeepSeek-OCR (no CUDA in main; GPU only inside the worker)
  # =========================
  def _load_ocr_model():
      model_name = "deepseek-ai/DeepSeek-OCR"
+     tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
      attn_impl = os.getenv("OCR_ATTN_IMPL", "flash_attention_2")
      try:
+         mdl = AutoModel.from_pretrained(
+             model_name,
+             _attn_implementation=attn_impl,
+             trust_remote_code=True,
+             use_safetensors=True
          ).eval()
+         return tok, mdl
      except Exception as e:
          if any(k in str(e).lower() for k in ["flash_attn", "flashattention2", "flash_attention_2"]):
+             mdl = AutoModel.from_pretrained(
+                 model_name,
+                 _attn_implementation="eager",
+                 trust_remote_code=True,
+                 use_safetensors=True
              ).eval()
+             return tok, mdl
          raise

  tokenizer, model = _load_ocr_model()
 
      if image is None:
          return None, "Please upload an image first.", "Please upload an image first."

+     # move to GPU ONLY inside the worker
      if torch.cuda.is_available():
          dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
          model_device = model.to(dtype).to("cuda")

      temp_image_path = os.path.join(output_path, "temp_image.jpg")
      image.save(temp_image_path)

+     size_cfg = {
+         "Tiny": (512, 512, False),
+         "Small": (640, 640, False),
+         "Base": (1024, 1024, False),
+         "Large": (1280, 1280, False),
+         "Gundam (Recommended)": (1024, 640, True),
      }
+     base_size, image_size, crop_mode = size_cfg.get(model_size, (1024, 640, True))

+     plain_text = model_device.infer(
          tokenizer,
          prompt=prompt,
          image_file=temp_image_path,
          output_path=output_path,
+         base_size=base_size,
+         image_size=image_size,
+         crop_mode=crop_mode,
          save_results=True,
          test_compress=True,
          eval_mode=is_eval_mode,

      if os.path.exists(image_result_path):
          result_image = Image.open(image_result_path); result_image.load()

+     text_result = plain_text if plain_text else markdown_content
      return result_image, markdown_content, text_result

  # =========================
  # UI (Gradio 5)
  # =========================
+ with gr.Blocks(title="DeepSeek-OCR + Med42 Instruct", theme=gr.themes.Soft()) as demo:
      gr.Markdown(
          """
+ # DeepSeek-OCR → Chat Clínico con **Med42 Instruct**
  1) **Sube una imagen** y corre **OCR** (imagen anotada, Markdown y texto).
+ 2) **Chatea** con **Med42** usando automáticamente el **OCR** como contexto.
  *Uso educativo; no reemplaza consejo médico.*
          """
      )
 
      with gr.Row():
          with gr.Column(scale=1):
              image_input = gr.Image(type="pil", label="Upload Image", sources=["upload", "clipboard", "webcam"])
+             model_size = gr.Dropdown(
+                 choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
+                 value="Gundam (Recommended)", label="Model Size"
+             )
+             task_type = gr.Dropdown(
+                 choices=["Free OCR", "Convert to Markdown"],
+                 value="Convert to Markdown", label="Task Type"
+             )
              eval_mode_checkbox = gr.Checkbox(value=False, label="Enable Evaluation Mode",
                                               info="Solo texto (más rápido). Desmárcalo para ver imagen anotada y markdown.")
              submit_btn = gr.Button("Process Image", variant="primary")

      with gr.Tabs():
          with gr.TabItem("Annotated Image"): output_image = gr.Image(interactive=False)
          with gr.TabItem("Markdown Preview"): output_markdown = gr.Markdown()
+         with gr.TabItem("Markdown Source / Eval"): output_text = gr.Textbox(lines=18, show_copy_button=True, interactive=False)
      with gr.Row():
          md_preview = gr.Textbox(label="Snapshot Markdown OCR", lines=10, interactive=False)
+         txt_preview = gr.Textbox_