jorgeiv500 committed
Commit 42632ea · 1 Parent(s): 2c7042c
Files changed (3):
  1. README.md +18 -45
  2. app.py +76 -154
  3. requirements.txt +4 -5
README.md CHANGED
@@ -1,5 +1,5 @@
  ---
- title: OpScan.IA — DeepSeek-OCR + DeepSeek-R1 Medical Mini
  emoji: 🩺
  colorFrom: gray
  colorTo: purple
@@ -9,51 +9,24 @@ app_file: app.py
  pinned: false
  ---

- # OpScan.IA — DeepSeek-OCR + DeepSeek-R1 Medical Mini

- A **Gradio** app that:
- 1) Extracts text and layout marks from a document/image with **DeepSeek-OCR**.
- 2) Automatically injects that OCR output as **context** for chatting with **DeepSeek-R1 Medical Mini** (remote or local GGUF).

- > **Educational use only.** Not a substitute for clinical judgment or a professional diagnosis.

- ---
-
- ## Features
- - **OCR**: bounding boxes, Markdown, and/or plain text from images (upload/clipboard/webcam).
- - **Clinical chat**: the LLM receives the OCR output as *system context* and answers cautiously.
- - **Chat modes**:
-   - **Remote (HF Inference)**: `R1_REMOTE=1` (no token needed if the model is public).
-   - **Local GGUF (CPU/Zero)**: `R1_REMOTE=0` with `llama.cpp`.
- - **Environment-tolerant**: if OCR fails because of `FlashAttention2`, it automatically falls back to `_attn_implementation="eager"`.
-
- ---
-
- ## 📦 Requirements
-
- `requirements.txt`:

- ```txt
- gradio==5.49.1
- spaces>=0.28.3
- torch==2.6.0
- torchvision==0.21.0
- transformers==4.46.3
- tokenizers==0.20.3
- accelerate>=0.34.2
- safetensors>=0.4.5
- huggingface-hub>=0.30.0
- hf-transfer>=0.1.6
- pillow>=10.4.0
- numpy>=1.26.0
- tqdm>=4.66.4
- requests>=2.31.0
- einops>=0.7.0
- addict>=2.4.0
- easydict>=1.13
- sentencepiece>=0.2.0
- pydantic==2.10.6
- protobuf<4
- click<8.1
- llama-cpp-python==0.2.90
- # (Optional GPU) flash-attn / xformers
 
  ---
+ title: OpScan.IA — DeepSeek-OCR + R1 Medical Mini (fast GGUF)
  emoji: 🩺
  colorFrom: gray
  colorTo: purple
  ...
  pinned: false
  ---

+ # OpScan.IA — DeepSeek-OCR + DeepSeek-R1 Medical Mini (fast local GGUF)

+ **Goal:** maximum speed **without tokens** on Spaces Zero/CPU.
+ The chat uses **DeepSeek-R1 Medical Mini** as a **GGUF** (Q4 quantization when available) with `llama.cpp` (see the sketch below).
+ OCR is done with **DeepSeek-OCR** (with an automatic *fallback* to `_attn_implementation="eager"` when FlashAttention2 is unavailable).
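As a rough sketch of what this chat path boils down to (assuming `llama-cpp-python` is installed and the default repo/file names listed under "Optional variables" below; adjust them for your own repo):

```python
# Minimal sketch of the GGUF chat path — illustrative, not the exact app code.
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Download the Q4 quantization (fastest of the defaults) from the public repo.
path = hf_hub_download(
    repo_id="mradermacher/DeepSeek-r1-Medical-Mini-GGUF",
    filename="DeepSeek-r1-Medical-Mini.Q4_K_M.gguf",
)
llm = Llama(model_path=path, n_ctx=2048, n_threads=4, n_gpu_layers=0)  # CPU-only
out = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Hola"}],
    temperature=0.2,
    max_tokens=64,
)
print(out["choices"][0]["message"]["content"])
```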

+ ## Requirements
+ See `requirements.txt`.

+ ## Optional variables
+ - `GGUF_REPO` (default: `mradermacher/DeepSeek-r1-Medical-Mini-GGUF`)
+ - `GGUF_FILE` (if unset, the app tries, in order: `Q4_K_M`, `Q4_0`, `Q5_0`, `Q8_0`, `f16`; see the example after this list)
+ - `N_CTX` (2048), `N_THREADS` (auto), `N_GPU_LAYERS` (0), `N_BATCH` (96), `WARMUP` (0/1)
+ - `OCR_ATTN_IMPL`: `flash_attention_2` or `eager`
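For example, to pin a specific quantization and skip the probe order at launch (hypothetical values; the variable names are the ones above, read by `app.py` at import time):

```python
# Hypothetical pre-launch configuration; the values here are illustrative.
import os

os.environ["GGUF_FILE"] = "DeepSeek-r1-Medical-Mini.Q4_K_M.gguf"  # skip the candidate probe
os.environ["N_THREADS"] = "8"          # match your CPU core count
os.environ["WARMUP"] = "1"             # load the GGUF at startup, not on the first message
os.environ["OCR_ATTN_IMPL"] = "eager"  # safe choice when FlashAttention2 is absent

import app  # noqa: E402 — app.py reads the variables above at import time
```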
+ ## Running locally
+ ```bash
+ python -m venv .venv
+ source .venv/bin/activate
+ pip install -r requirements.txt
+ python app.py
+ ```
app.py CHANGED
@@ -1,103 +1,55 @@
- # app.py — DeepSeek-OCR + DeepSeek-R1 Medical Mini (remote HF or local GGUF) — Gradio 5
  import os, tempfile, traceback
  import gradio as gr
  import torch
  from PIL import Image
  from transformers import AutoModel, AutoTokenizer
  import spaces
- from huggingface_hub import hf_hub_download, InferenceClient
  from llama_cpp import Llama

  # ===============================================================
- # LLM (chat) configuration: DeepSeek-R1 Medical Mini
- # - Remote (HF Inference): R1_REMOTE=1 and (optionally) R1_MODEL_ID, HF_TOKEN
- # - Local GGUF (CPU/Zero): R1_REMOTE=0 and GGUF_REPO / GGUF_FILE
  # ===============================================================
- R1_REMOTE = os.getenv("R1_REMOTE", "0") == "1"
- R1_MODEL_ID = os.getenv("R1_MODEL_ID", "Mouhib007/DeepSeek-r1-Medical-Mini")
- HF_TOKEN = os.getenv("HF_TOKEN")  # public model -> may be None
-
- # ---- Local GGUF (fallback / offline mode) ----
- GGUF_CANDIDATES = []
- ENV_REPO = os.getenv("GGUF_REPO", "").strip()
- ENV_FILE = os.getenv("GGUF_FILE", "").strip()
- if ENV_REPO and ENV_FILE:
-     GGUF_CANDIDATES.append((ENV_REPO, ENV_FILE))
- # Default candidate (adjust it if you use a different repo)
- GGUF_CANDIDATES.append((
-     "mradermacher/DeepSeek-r1-Medical-Mini-GGUF",
-     "DeepSeek-r1-Medical-Mini.f16.gguf"
- ))

  N_CTX = int(os.getenv("N_CTX", "2048"))
  N_THREADS = int(os.getenv("N_THREADS", str(os.cpu_count() or 4)))
- N_GPU_LAYERS = int(os.getenv("N_GPU_LAYERS", "0"))
  N_BATCH = int(os.getenv("N_BATCH", "96"))

- # ---- Remote client (HF Inference) ----
- _remote_client = None
- def get_remote_client():
-     global _remote_client
-     if _remote_client is None:
-         _remote_client = InferenceClient(model=R1_MODEL_ID, token=HF_TOKEN, timeout=60)
-     return _remote_client
-
- # ---- ChatML formatting (DeepSeek/Qwen compatible) ----
- def _format_chatml(messages):
-     parts = []
-     for m in messages:
-         role = m.get("role", "user")
-         content = m.get("content", "")
-         parts.append(f"<|im_start|>{role}\n{content}<|im_end|>\n")
-     parts.append("<|im_start|>assistant\n")
-     return "".join(parts)
-
- def r1_chat(messages, temperature=0.2, max_tokens=384):
-     """Remote (HF) or local (llama-cpp) chat with DeepSeek-R1 Medical Mini."""
-     if R1_REMOTE:
-         client = get_remote_client()
-         try:
-             # Some endpoints support chat_completion
-             resp = client.chat_completion(messages=messages, temperature=temperature, max_tokens=max_tokens)
-             return resp.choices[0].message["content"]
-         except Exception:
-             # Universal fallback: text_generation with a ChatML prompt
-             try:
-                 prompt = _format_chatml(messages)
-                 return client.text_generation(
-                     prompt,
-                     max_new_tokens=max_tokens,
-                     temperature=temperature,
-                     stop_sequences=["<|im_end|>"],
-                     stream=False,
-                 )
-             except Exception:
-                 # If remote fails (401/429/etc.), fall back to local GGUF if available
-                 pass
-     # Local GGUF
-     llm = get_llm()
-     out = llm.create_chat_completion(messages=messages, temperature=temperature, max_tokens=max_tokens)
-     return out["choices"][0]["message"]["content"]
-
- # ---- Local loader (GGUF) ----
  _llm = None
  def _download_gguf():
      last_err = None
-     for repo, fname in GGUF_CANDIDATES:
          try:
-             return hf_hub_download(repo_id=repo, filename=fname), repo, fname
          except Exception as e:
              last_err = e
-     raise RuntimeError(f"No se pudo descargar ningún GGUF. Último error: {last_err}")

  def get_llm():
      global _llm
      if _llm is not None:
          return _llm
-     gguf_path, _, _ = _download_gguf()
      _llm = Llama(
          model_path=gguf_path,
-         # Don't force chat_format; use the one bundled with the R1 GGUF
          n_ctx=N_CTX,
          n_threads=N_THREADS,
          n_gpu_layers=N_GPU_LAYERS,
@@ -106,15 +58,26 @@ def get_llm():
      )
      return _llm

- # Optional warmup (so the first message doesn't wait when running locally)
- if os.getenv("WARMUP", "0") == "1" and not R1_REMOTE:
-     try:
-         get_llm()
-     except Exception:
-         pass

  # ===============================================================
- # DeepSeek-OCR (UNCHANGED, with a fallback when FlashAttention2 is missing)
  # ===============================================================
  def _best_dtype():
      if torch.cuda.is_available():
@@ -124,24 +87,16 @@ def _best_dtype():
  def _load_ocr_model():
      model_name = "deepseek-ai/DeepSeek-OCR"
      ocr_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-     attn_impl = os.getenv("OCR_ATTN_IMPL", "flash_attention_2")  # same default as before
      try:
          ocr_model = AutoModel.from_pretrained(
-             model_name,
-             _attn_implementation=attn_impl,
-             trust_remote_code=True,
-             use_safetensors=True,
          ).eval()
          return ocr_tokenizer, ocr_model
      except Exception as e:
-         # If it fails because of FlashAttention2, retry in "eager" mode (CPU/compat)
-         msg = str(e)
-         if "flash_attn" in msg or "FlashAttention2" in msg or "flash_attention_2" in msg:
              ocr_model = AutoModel.from_pretrained(
-                 model_name,
-                 _attn_implementation="eager",
-                 trust_remote_code=True,
-                 use_safetensors=True,
              ).eval()
              return ocr_tokenizer, ocr_model
          raise
@@ -150,22 +105,13 @@ tokenizer, model = _load_ocr_model()

  @spaces.GPU
  def process_image(image, model_size, task_type, is_eval_mode):
-     """
-     Returns: annotated image, markdown, and text (or markdown when there is no text).
-     """
      if image is None:
          return None, "Please upload an image first.", "Please upload an image first."
      dtype = _best_dtype()
      model_device = model.cuda().to(dtype) if torch.cuda.is_available() else model.to(dtype)

      with tempfile.TemporaryDirectory() as output_path:
-         if task_type == "Free OCR":
-             prompt = "<image>\nFree OCR. "
-         elif task_type == "Convert to Markdown":
-             prompt = "<image>\n<|grounding|>Convert the document to markdown. "
-         else:
-             prompt = "<image>\nFree OCR. "
-
          temp_image_path = os.path.join(output_path, "temp_image.jpg")
          image.save(temp_image_path)

@@ -194,44 +140,36 @@ def process_image(image, model_size, task_type, is_eval_mode):
          image_result_path = os.path.join(output_path, "result_with_boxes.jpg")
          markdown_result_path = os.path.join(output_path, "result.mmd")

          if os.path.exists(markdown_result_path):
              with open(markdown_result_path, "r", encoding="utf-8") as f:
                  markdown_content = f.read()
-         else:
-             markdown_content = "Markdown result was not generated. This is expected for 'Free OCR' task."

          result_image = None
          if os.path.exists(image_result_path):
-             result_image = Image.open(image_result_path)
-             result_image.load()

          text_result = plain_text_result if plain_text_result else markdown_content
          return result_image, markdown_content, text_result

  # ===============================================================
- # Chat (injects the OCR into the first system message) — using R1
  # ===============================================================
- def _truncate(text, max_chars=3000):
-     return (text or "")[:max_chars]

  def _system_prompt():
-     return (
-         "Eres un asistente clínico educativo. No sustituyes el juicio médico. "
-         "Usa CONTEXTO_OCR si existe; si falta, pídelo. Evita diagnósticos definitivos."
-     )

- def _ocr_context(ocr_md, ocr_txt):
-     return _truncate(ocr_md) or _truncate(ocr_txt) or ""

  def to_chat_messages(chat_msgs, ocr_md, ocr_txt):
      sys = _system_prompt()
      ctx = _ocr_context(ocr_md, ocr_txt)
      if ctx:
-         sys += (
-             "\n\n---\n"
-             "CONTEXTO_OCR (fuente principal; si falta un dato, dilo explícitamente):\n"
-             f"{ctx}\n---"
-         )
      msgs = [{"role": "system", "content": sys}]
      for m in (chat_msgs or []):
          if m.get("role") in ("user", "assistant"):
@@ -243,33 +181,28 @@ def r1_reply(user_msg, chat_msgs, ocr_md, ocr_txt):
          user_msg = "Analiza el CONTEXTO_OCR anterior y responde a partir de ese contenido."
      try:
          msgs = to_chat_messages(chat_msgs, ocr_md, ocr_txt) + [{"role": "user", "content": user_msg}]
-         answer = r1_chat(msgs, temperature=0.2, max_tokens=512)
-         updated = (chat_msgs or []) + [
-             {"role": "user", "content": user_msg},
-             {"role": "assistant", "content": answer},
-         ]
          return updated, "", gr.update(value="")
      except Exception as e:
          err = f"{e.__class__.__name__}: {str(e) or repr(e)}"
          tb = traceback.format_exc(limit=2)
-         updated = (chat_msgs or []) + [
-             {"role": "user", "content": user_msg or ""},
-             {"role": "assistant", "content": f"⚠️ Error LLM: {err}"},
-         ]
          return updated, "", gr.update(value=f"{err}\n{tb}")

- def clear_chat():
-     return [], "", gr.update(value="")

  # ===============================================================
  # UI (Gradio 5)
  # ===============================================================
- with gr.Blocks(title="DeepSeek-OCR + DeepSeek-R1 Medical Mini", theme=gr.themes.Soft()) as demo:
      gr.Markdown(
          """
-         # DeepSeek-OCR → Chat Médico con **DeepSeek-R1 Medical Mini** (remoto HF o local GGUF)
          1) **Sube una imagen** y corre **OCR** (imagen anotada, Markdown y texto).
-         2) **Chatea** con **DeepSeek-R1 Medical Mini** usando automáticamente el **OCR** como contexto.
          *Uso educativo; no reemplaza consejo médico.*
          """
      )
@@ -280,18 +213,12 @@ with gr.Blocks(title="DeepSeek-OCR + DeepSeek-R1 Medical Mini", theme=gr.themes.
      with gr.Row():
          with gr.Column(scale=1):
              image_input = gr.Image(type="pil", label="Upload Image", sources=["upload", "clipboard", "webcam"])
-             model_size = gr.Dropdown(
-                 choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
-                 value="Gundam (Recommended)", label="Model Size",
-             )
-             task_type = gr.Dropdown(
-                 choices=["Free OCR", "Convert to Markdown"],
-                 value="Convert to Markdown", label="Task Type",
-             )
-             eval_mode_checkbox = gr.Checkbox(
-                 value=False, label="Enable Evaluation Mode",
-                 info="Solo texto (más rápido). Desmárcalo para ver imagen anotada y markdown.",
-             )
              submit_btn = gr.Button("Process Image", variant="primary")

          with gr.Column(scale=2):
@@ -304,10 +231,10 @@ with gr.Blocks(title="DeepSeek-OCR + DeepSeek-R1 Medical Mini", theme=gr.themes.
              md_preview = gr.Textbox(label="Snapshot Markdown OCR", lines=10, interactive=False)
              txt_preview = gr.Textbox(label="Snapshot Texto OCR", lines=10, interactive=False)

-     gr.Markdown("## Chat Clínico (DeepSeek-R1 Medical Mini)")
      with gr.Row():
          with gr.Column(scale=2):
-             chatbot = gr.Chatbot(label="Asistente OCR (R1 Medical Mini)", type="messages", height=420)
              user_in = gr.Textbox(label="Mensaje", placeholder="Escribe tu consulta… (vacío = analiza solo el OCR)", lines=2)
              with gr.Row():
                  send_btn = gr.Button("Enviar", variant="primary")
@@ -315,7 +242,6 @@ with gr.Blocks(title="DeepSeek-OCR + DeepSeek-R1 Medical Mini", theme=gr.themes.
          with gr.Column(scale=1):
              error_box = gr.Textbox(label="Debug (si hay error)", lines=8, interactive=False)

-     # OCR → outputs and state
      submit_btn.click(
          fn=process_image,
          inputs=[image_input, model_size, task_type, eval_mode_checkbox],
@@ -326,12 +252,8 @@ with gr.Blocks(title="DeepSeek-OCR + DeepSeek-R1 Medical Mini", theme=gr.themes.
          outputs=[ocr_md_state, ocr_txt_state, md_preview, txt_preview],
      )

-     # Chat
-     send_btn.click(
-         fn=r1_reply,
-         inputs=[user_in, chatbot, ocr_md_state, ocr_txt_state],
-         outputs=[chatbot, user_in, error_box],
-     )
      clear_btn.click(fn=clear_chat, outputs=[chatbot, user_in, error_box])

  if __name__ == "__main__":

+ # app.py — DeepSeek-OCR + DeepSeek-R1 Medical Mini (fast local GGUF) — Gradio 5
  import os, tempfile, traceback
  import gradio as gr
  import torch
  from PIL import Image
  from transformers import AutoModel, AutoTokenizer
  import spaces
+ from huggingface_hub import hf_hub_download
  from llama_cpp import Llama

  # ===============================================================
+ # CHAT: DeepSeek-R1 Medical Mini — LOCAL ONLY (GGUF) for maximum speed without tokens
+ # - You can force a specific file with GGUF_REPO / GGUF_FILE
+ # - If you don't, we try Q4 (fast) and fall back to f16 if it isn't there
  # ===============================================================
+ GGUF_REPO = os.getenv("GGUF_REPO", "mradermacher/DeepSeek-r1-Medical-Mini-GGUF").strip()
+ GGUF_FILE = os.getenv("GGUF_FILE", "").strip()
+
+ # Preference order (fastest -> heaviest). Change the names if your repo uses others.
+ _DEFAULT_CANDIDATES = [
+     "DeepSeek-r1-Medical-Mini.Q4_K_M.gguf",
+     "DeepSeek-r1-Medical-Mini.Q4_0.gguf",
+     "DeepSeek-r1-Medical-Mini.Q5_0.gguf",
+     "DeepSeek-r1-Medical-Mini.Q8_0.gguf",
+     "DeepSeek-r1-Medical-Mini.f16.gguf",
+ ]
+ GGUF_CANDIDATES = [GGUF_FILE] if GGUF_FILE else _DEFAULT_CANDIDATES

  N_CTX = int(os.getenv("N_CTX", "2048"))
  N_THREADS = int(os.getenv("N_THREADS", str(os.cpu_count() or 4)))
+ N_GPU_LAYERS = int(os.getenv("N_GPU_LAYERS", "0"))  # Zero/CPU => 0
  N_BATCH = int(os.getenv("N_BATCH", "96"))

  _llm = None
  def _download_gguf():
      last_err = None
+     for fname in GGUF_CANDIDATES:
          try:
+             path = hf_hub_download(repo_id=GGUF_REPO, filename=fname)
+             return path, fname
          except Exception as e:
              last_err = e
+     raise RuntimeError(f"No se pudo descargar GGUF desde {GGUF_REPO}. Último error: {last_err}")

  def get_llm():
      global _llm
      if _llm is not None:
          return _llm
+     gguf_path, used = _download_gguf()
+     print(f"[R1/llama.cpp] usando: {used}")
      _llm = Llama(
          model_path=gguf_path,
          n_ctx=N_CTX,
          n_threads=N_THREADS,
          n_gpu_layers=N_GPU_LAYERS,
  ...
      )
      return _llm

+ def _format_chatml(messages):
+     parts = []
+     for m in messages:
+         parts.append(f"<|im_start|>{m.get('role','user')}\n{m.get('content','')}<|im_end|>\n")
+     parts.append("<|im_start|>assistant\n")
+     return "".join(parts)
+
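For reference, a short exchange run through `_format_chatml` produces a ChatML prompt like this (illustrative output, not part of the commit):

```python
# Illustrative: what _format_chatml returns for a two-message conversation.
msgs = [
    {"role": "system", "content": "Eres un asistente clínico educativo."},
    {"role": "user", "content": "Hola"},
]
print(_format_chatml(msgs))
# <|im_start|>system
# Eres un asistente clínico educativo.<|im_end|>
# <|im_start|>user
# Hola<|im_end|>
# <|im_start|>assistant
```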
+ def r1_chat_local(messages, temperature=0.2, max_tokens=384):
+     # llama.cpp accepts messages directly; if your build doesn't, use prompt=_format_chatml(messages)
+     llm = get_llm()
+     out = llm.create_chat_completion(messages=messages, temperature=temperature, max_tokens=max_tokens)
+     return out["choices"][0]["message"]["content"]
+
+ # Optional warmup
+ if os.getenv("WARMUP", "0") == "1":
+     try:
+         get_llm()
+     except Exception:
+         pass
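If a `llama-cpp-python` build lacks `create_chat_completion`, the comment in `r1_chat_local` points at the prompt route; a minimal sketch of that fallback (hypothetical helper, using the documented `create_completion` API):

```python
# Hypothetical prompt-based fallback for builds without create_chat_completion.
def r1_chat_local_prompt(messages, temperature=0.2, max_tokens=384):
    llm = get_llm()
    out = llm.create_completion(
        prompt=_format_chatml(messages),  # ChatML-formatted conversation
        temperature=temperature,
        max_tokens=max_tokens,
        stop=["<|im_end|>"],              # cut off at the end of the assistant turn
    )
    return out["choices"][0]["text"]
```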
 
  # ===============================================================
+ # DeepSeek-OCR (unchanged) with a fallback when FlashAttention2 is missing
  # ===============================================================
  def _best_dtype():
      if torch.cuda.is_available():
  ...
  def _load_ocr_model():
      model_name = "deepseek-ai/DeepSeek-OCR"
      ocr_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+     attn_impl = os.getenv("OCR_ATTN_IMPL", "flash_attention_2")
      try:
          ocr_model = AutoModel.from_pretrained(
+             model_name, _attn_implementation=attn_impl, trust_remote_code=True, use_safetensors=True
          ).eval()
          return ocr_tokenizer, ocr_model
      except Exception as e:
+         if any(k in str(e).lower() for k in ["flash_attn", "flashattention2", "flash_attention_2"]):
              ocr_model = AutoModel.from_pretrained(
+                 model_name, _attn_implementation="eager", trust_remote_code=True, use_safetensors=True
              ).eval()
              return ocr_tokenizer, ocr_model
          raise
 

  @spaces.GPU
  def process_image(image, model_size, task_type, is_eval_mode):
      if image is None:
          return None, "Please upload an image first.", "Please upload an image first."
      dtype = _best_dtype()
      model_device = model.cuda().to(dtype) if torch.cuda.is_available() else model.to(dtype)

      with tempfile.TemporaryDirectory() as output_path:
+         prompt = "<image>\nFree OCR. " if task_type == "Free OCR" else "<image>\n<|grounding|>Convert the document to markdown. "
          temp_image_path = os.path.join(output_path, "temp_image.jpg")
          image.save(temp_image_path)
  ...
          image_result_path = os.path.join(output_path, "result_with_boxes.jpg")
          markdown_result_path = os.path.join(output_path, "result.mmd")

+         markdown_content = "Markdown result was not generated. This is expected for 'Free OCR' task."
          if os.path.exists(markdown_result_path):
              with open(markdown_result_path, "r", encoding="utf-8") as f:
                  markdown_content = f.read()

          result_image = None
          if os.path.exists(image_result_path):
+             result_image = Image.open(image_result_path)
+             result_image.load()  # force the pixel read before the temp dir is removed

          text_result = plain_text_result if plain_text_result else markdown_content
          return result_image, markdown_content, text_result
  # ===============================================================
+ # Chat (injects the OCR) — with local R1
  # ===============================================================
+ def _truncate(text, max_chars=3000):
+     return (text or "")[:max_chars]

  def _system_prompt():
+     return ("Eres un asistente clínico educativo. No sustituyes el juicio médico. "
+             "Usa CONTEXTO_OCR si existe; si falta, pídelo. Evita diagnósticos definitivos.")

+ def _ocr_context(ocr_md, ocr_txt):
+     return _truncate(ocr_md) or _truncate(ocr_txt) or ""

  def to_chat_messages(chat_msgs, ocr_md, ocr_txt):
      sys = _system_prompt()
      ctx = _ocr_context(ocr_md, ocr_txt)
      if ctx:
+         sys += ("\n\n---\n"
+                 "CONTEXTO_OCR (fuente principal; si falta un dato, dilo explícitamente):\n"
+                 f"{ctx}\n---")
      msgs = [{"role": "system", "content": sys}]
      for m in (chat_msgs or []):
          if m.get("role") in ("user", "assistant"):
  ...
          user_msg = "Analiza el CONTEXTO_OCR anterior y responde a partir de ese contenido."
      try:
          msgs = to_chat_messages(chat_msgs, ocr_md, ocr_txt) + [{"role": "user", "content": user_msg}]
+         answer = r1_chat_local(msgs, temperature=0.2, max_tokens=512)
+         updated = (chat_msgs or []) + [{"role": "user", "content": user_msg},
+                                        {"role": "assistant", "content": answer}]
          return updated, "", gr.update(value="")
      except Exception as e:
          err = f"{e.__class__.__name__}: {str(e) or repr(e)}"
          tb = traceback.format_exc(limit=2)
+         updated = (chat_msgs or []) + [{"role": "user", "content": user_msg or ""},
+                                        {"role": "assistant", "content": f"⚠️ Error LLM: {err}"}]
          return updated, "", gr.update(value=f"{err}\n{tb}")

+ def clear_chat():
+     return [], "", gr.update(value="")

  # ===============================================================
  # UI (Gradio 5)
  # ===============================================================
+ with gr.Blocks(title="DeepSeek-OCR + R1 Medical (GGUF rápido)", theme=gr.themes.Soft()) as demo:
      gr.Markdown(
          """
+         # DeepSeek-OCR → Chat Médico con **DeepSeek-R1 Medical Mini (GGUF local rápido)**
          1) **Sube una imagen** y corre **OCR** (imagen anotada, Markdown y texto).
+         2) **Chatea** con **R1 Medical Mini** usando automáticamente el **OCR** como contexto.
          *Uso educativo; no reemplaza consejo médico.*
          """
      )
  ...
      with gr.Row():
          with gr.Column(scale=1):
              image_input = gr.Image(type="pil", label="Upload Image", sources=["upload", "clipboard", "webcam"])
+             model_size = gr.Dropdown(choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
+                                      value="Gundam (Recommended)", label="Model Size")
+             task_type = gr.Dropdown(choices=["Free OCR", "Convert to Markdown"],
+                                     value="Convert to Markdown", label="Task Type")
+             eval_mode_checkbox = gr.Checkbox(value=False, label="Enable Evaluation Mode",
+                                              info="Solo texto (más rápido). Desmárcalo para ver imagen anotada y markdown.")
              submit_btn = gr.Button("Process Image", variant="primary")

          with gr.Column(scale=2):
  ...
              md_preview = gr.Textbox(label="Snapshot Markdown OCR", lines=10, interactive=False)
              txt_preview = gr.Textbox(label="Snapshot Texto OCR", lines=10, interactive=False)

+     gr.Markdown("## Chat Clínico (R1 Medical Mini — GGUF local)")
      with gr.Row():
          with gr.Column(scale=2):
+             chatbot = gr.Chatbot(label="Asistente OCR (R1 GGUF)", type="messages", height=420)
              user_in = gr.Textbox(label="Mensaje", placeholder="Escribe tu consulta… (vacío = analiza solo el OCR)", lines=2)
              with gr.Row():
                  send_btn = gr.Button("Enviar", variant="primary")
  ...
          with gr.Column(scale=1):
              error_box = gr.Textbox(label="Debug (si hay error)", lines=8, interactive=False)

      submit_btn.click(
          fn=process_image,
          inputs=[image_input, model_size, task_type, eval_mode_checkbox],
  ...
          outputs=[ocr_md_state, ocr_txt_state, md_preview, txt_preview],
      )

+     send_btn.click(fn=r1_reply, inputs=[user_in, chatbot, ocr_md_state, ocr_txt_state],
+                    outputs=[chatbot, user_in, error_box])
      clear_btn.click(fn=clear_chat, outputs=[chatbot, user_in, error_box])

  if __name__ == "__main__":

requirements.txt CHANGED
@@ -1,8 +1,7 @@
- # --- Core runtime ---
  gradio==5.49.1
  spaces>=0.28.3

- # PyTorch + Transformers
  torch==2.6.0
  torchvision==0.21.0
  transformers==4.46.3
@@ -12,7 +11,7 @@ safetensors>=0.4.5
  huggingface-hub>=0.30.0
  hf-transfer>=0.1.6

- # Vision / utils
  pillow>=10.4.0
  numpy>=1.26.0
  tqdm>=4.66.4
@@ -25,9 +24,9 @@ pydantic==2.10.6
  protobuf<4
  click<8.1

- # Llama.cpp (local GGUF for the chat when R1_REMOTE=0)
  llama-cpp-python==0.2.90

- # --- Optional (GPU, to speed up OCR with flash_attention_2) ---
  # flash-attn==2.7.3 --no-build-isolation
  # xformers==0.0.28.post1

 
  gradio==5.49.1
  spaces>=0.28.3

+ # PyTorch + Transformers (for DeepSeek-OCR)
  torch==2.6.0
  torchvision==0.21.0
  transformers==4.46.3
  ...
  huggingface-hub>=0.30.0
  hf-transfer>=0.1.6

+ # Utils and vision
  pillow>=10.4.0
  numpy>=1.26.0
  tqdm>=4.66.4
  ...
  protobuf<4
  click<8.1

+ # Local LLM (GGUF)
  llama-cpp-python==0.2.90

+ # (Optional GPU to speed up OCR; don't install on CPU/Zero)
  # flash-attn==2.7.3 --no-build-isolation
  # xformers==0.0.28.post1