jorgeiv500 committed on
Commit 2c7042c · 1 Parent(s): 2681cf8
Files changed (3)
  1. README.md +47 -31
  2. app.py +248 -99
  3. requirements.txt +30 -12
README.md CHANGED
@@ -1,43 +1,59 @@
  ---
- title: DeepSeek OCR Demo
- emoji: 🖼
- colorFrom: purple
- colorTo: red
  sdk: gradio
- sdk_version: 5.44.0
  app_file: app.py
  pinned: false
- license: mit
- short_description: An interactive demo for the DeepSeek-OCR model.
  ---

- # DeepSeek-OCR Document Recognition

- This Space uses the DeepSeek-OCR model for document text recognition and extraction.

- ## Features

- - Multiple model size options (Tiny to Large)
- - Free OCR and Markdown conversion
- - Support for various document types
- - Powered by ZeroGPU for efficient inference
-
- ## Usage
-
- 1. Upload an image containing text
- 2. Select model size (Gundam recommended for documents)
- 3. Choose task type
- 4. Click "Process Image"
-
- ## Model Sizes
-
- - **Tiny**: 512x512, fastest
- - **Small**: 640x640, good balance
- - **Base**: 1024x1024, high quality
- - **Large**: 1280x1280, best quality
- - **Gundam**: Optimized for documents with crop mode

- ## Credits

- Model: [deepseek-ai/DeepSeek-OCR](https://huggingface.co/deepseek-ai/DeepSeek-OCR)
  ---
+ title: OpScan.IA — DeepSeek-OCR + DeepSeek-R1 Medical Mini
+ emoji: 🩺
+ colorFrom: gray
+ colorTo: purple
  sdk: gradio
+ sdk_version: 5.49.1
  app_file: app.py
  pinned: false
  ---

+ # OpScan.IA — DeepSeek-OCR + DeepSeek-R1 Medical Mini

+ A **Gradio** app that:
+ 1) Extracts text and markings from a document/image with **DeepSeek-OCR**.
+ 2) Automatically injects that OCR output as **context** for chatting with **DeepSeek-R1 Medical Mini** (remote or local GGUF).

+ > **Educational use only.** Not a substitute for clinical judgment or professional diagnosis.

+ ---

+ ## ✨ Features
+ - **OCR**: boxes, Markdown, and/or plain text from images (upload/clipboard/camera).
+ - **Clinical chat**: the LLM receives the OCR output as *system context* and answers cautiously.
+ - **Chat modes**:
+   - **Remote (HF Inference)**: `R1_REMOTE=1` (no token needed if the model is public).
+   - **Local GGUF (CPU/Zero)**: `R1_REMOTE=0` with `llama.cpp`.
+ - **Environment-tolerant**: if OCR loading fails on `FlashAttention2`, it automatically falls back to `_attn_implementation="eager"`.

+ ---

+ ## 📦 Requirements
+
+ `requirements.txt`:
+
+ ```txt
+ gradio==5.49.1
+ spaces>=0.28.3
+ torch==2.6.0
+ torchvision==0.21.0
+ transformers==4.46.3
+ tokenizers==0.20.3
+ accelerate>=0.34.2
+ safetensors>=0.4.5
+ huggingface-hub>=0.30.0
+ hf-transfer>=0.1.6
+ pillow>=10.4.0
+ numpy>=1.26.0
+ tqdm>=4.66.4
+ requests>=2.31.0
+ einops>=0.7.0
+ addict>=2.4.0
+ easydict>=1.13
+ sentencepiece>=0.2.0
+ pydantic==2.10.6
+ protobuf<4
+ click<8.1
+ llama-cpp-python==0.2.90
+ # (Optional, GPU) flash-attn / xformers
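For reference, the remote chat mode the new README describes (`R1_REMOTE=1`) can be exercised directly; a minimal sketch using `huggingface_hub.InferenceClient` with the model ID and environment variable names taken from this commit's `app.py` (the prompt content is illustrative only):

```python
# Hedged sketch: query DeepSeek-R1 Medical Mini remotely, as the Space does
# when R1_REMOTE=1. Assumes the model is public; set HF_TOKEN otherwise.
import os
from huggingface_hub import InferenceClient

client = InferenceClient(
    model=os.getenv("R1_MODEL_ID", "Mouhib007/DeepSeek-r1-Medical-Mini"),
    token=os.getenv("HF_TOKEN"),  # may be None for a public model
    timeout=60,
)
messages = [
    {"role": "system", "content": "You are an educational clinical assistant."},
    {"role": "user", "content": "Summarize this lab line: hemoglobin 9.8 g/dL."},
]
resp = client.chat_completion(messages=messages, temperature=0.2, max_tokens=256)
print(resp.choices[0].message.content)
```

The local mode (`R1_REMOTE=0`) instead downloads a GGUF file and serves it with `llama-cpp-python`, as wired up in `app.py` below.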
app.py CHANGED
@@ -1,47 +1,164 @@
 
 
  import gradio as gr
  import torch
  from transformers import AutoModel, AutoTokenizer
  import spaces
- import os
- import tempfile
- from PIL import Image

- # Load model and tokenizer
- model_name = "deepseek-ai/DeepSeek-OCR"
- tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
- model = AutoModel.from_pretrained(
-     model_name,
-     _attn_implementation="flash_attention_2",
-     trust_remote_code=True,
-     use_safetensors=True,
- )
- model = model.eval()


  @spaces.GPU
  def process_image(image, model_size, task_type, is_eval_mode):
      """
-     Process image with DeepSeek-OCR and return multiple output formats.
-
-     Args:
-         image: PIL Image or file path
-         model_size: Model size configuration
-         task_type: OCR task type
-
-     Returns:
-         A tuple containing:
-         - Path to the image with bounding boxes.
-         - The content of the markdown result file.
-         - The plain text OCR result.
      """
      if image is None:
          return None, "Please upload an image first.", "Please upload an image first."

-     model_gpu = model.cuda().to(torch.bfloat16)
-
-     # Create temporary directory for output
      with tempfile.TemporaryDirectory() as output_path:
-         # Set prompt based on task type
          if task_type == "Free OCR":
              prompt = "<image>\nFree OCR. "
          elif task_type == "Convert to Markdown":
@@ -49,27 +166,19 @@ def process_image(image, model_size, task_type, is_eval_mode):
          else:
              prompt = "<image>\nFree OCR. "

-         # Save uploaded image temporarily
          temp_image_path = os.path.join(output_path, "temp_image.jpg")
          image.save(temp_image_path)

-         # Configure model size parameters
          size_configs = {
              "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
              "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
              "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
              "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
-             "Gundam (Recommended)": {
-                 "base_size": 1024,
-                 "image_size": 640,
-                 "crop_mode": True,
-             },
          }
-
          config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])

-         # Run inference
-         plain_text_result = model_gpu.infer(
              tokenizer,
              prompt=prompt,
              image_file=temp_image_path,
@@ -77,114 +186,154 @@ def process_image(image, model_size, task_type, is_eval_mode):
              base_size=config["base_size"],
              image_size=config["image_size"],
              crop_mode=config["crop_mode"],
-             save_results=True,  # Ensure results are saved to disk
              test_compress=True,
              eval_mode=is_eval_mode,
          )

-         # Define paths for the generated files
          image_result_path = os.path.join(output_path, "result_with_boxes.jpg")
          markdown_result_path = os.path.join(output_path, "result.mmd")

-         # Read the markdown file content if it exists
-         markdown_content = ""
          if os.path.exists(markdown_result_path):
              with open(markdown_result_path, "r", encoding="utf-8") as f:
                  markdown_content = f.read()
          else:
              markdown_content = "Markdown result was not generated. This is expected for 'Free OCR' task."

-
          result_image = None
-         # Check if the annotated image exists
          if os.path.exists(image_result_path):
              result_image = Image.open(image_result_path)
              result_image.load()

-         # Return all three results. Gradio will handle the temporary file path for the image.
          text_result = plain_text_result if plain_text_result else markdown_content
          return result_image, markdown_content, text_result


- # Create Gradio interface
- with gr.Blocks(title="DeepSeek-OCR", theme=gr.themes.Soft()) as demo:
      gr.Markdown(
          """
-         # DeepSeek-OCR Demo
-
-         Upload an image to extract text using DeepSeek-OCR model.
-         Supports various document types and handwriting recognition.
-
-         **Model Sizes:**
-         - **Tiny**: Fastest, lower accuracy (512x512)
-         - **Small**: Fast, good accuracy (640x640)
-         - **Base**: Balanced performance (1024x1024)
-         - **Large**: Best accuracy, slower (1280x1280)
-         - **Gundam (Recommended)**: Optimized for documents (1024 base, 640 image, crop mode)
          """
      )

      with gr.Row():
          with gr.Column(scale=1):
-             image_input = gr.Image(
-                 type="pil", label="Upload Image", sources=["upload", "clipboard"]
-             )
-
              model_size = gr.Dropdown(
                  choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
-                 value="Gundam (Recommended)",
-                 label="Model Size",
              )
-
              task_type = gr.Dropdown(
                  choices=["Free OCR", "Convert to Markdown"],
-                 value="Convert to Markdown",
-                 label="Task Type",
              )
-
              eval_mode_checkbox = gr.Checkbox(
-                 value=False,
-                 label="Enable Evaluation Mode",
-                 info="Returns only plain text, but might be faster. Uncheck to get annotated image and markdown.",
              )
-
              submit_btn = gr.Button("Process Image", variant="primary")

          with gr.Column(scale=2):
              with gr.Tabs():
-                 with gr.TabItem("Annotated Image"):
-                     output_image = gr.Image(
-                         interactive=False
-                     )
-                 with gr.TabItem("Markdown Preview"):
-                     output_markdown = gr.Markdown()
-                 with gr.TabItem("Markdown Source(or Eval Output)"):
-                     output_text = gr.Textbox(
-                         lines=20,
-                         show_copy_button=True,
-                         interactive=False,
-                     )
-
-     # Examples
-     gr.Examples(
-         examples=[
-             ["examples/math.png", "Gundam (Recommended)", "Convert to Markdown"],
-             ["examples/receipt.jpg", "Base", "Convert to Markdown"],
-             ["examples/receipt-2.png", "Base", "Convert to Markdown"],
-         ],
-         inputs=[image_input, model_size, task_type, eval_mode_checkbox],
-         outputs=[output_image, output_markdown, output_text],
-         fn=process_image,
-         cache_examples=True,
-     )

      submit_btn.click(
          fn=process_image,
          inputs=[image_input, model_size, task_type, eval_mode_checkbox],
          outputs=[output_image, output_markdown, output_text],
      )

- # Launch the app
  if __name__ == "__main__":
      demo.queue(max_size=20)
      demo.launch()
 
+ # app.py — DeepSeek-OCR + DeepSeek-R1 Medical Mini (remote HF or local GGUF) — Gradio 5
+ import os, tempfile, traceback
  import gradio as gr
  import torch
+ from PIL import Image
  from transformers import AutoModel, AutoTokenizer
  import spaces
+ from huggingface_hub import hf_hub_download, InferenceClient
+ from llama_cpp import Llama
+
+ # ===============================================================
+ # LLM (chat) configuration: DeepSeek-R1 Medical Mini
+ # - Remote (HF Inference): R1_REMOTE=1 and, optionally, R1_MODEL_ID, HF_TOKEN
+ # - Local GGUF (CPU/Zero): R1_REMOTE=0 and GGUF_REPO / GGUF_FILE
+ # ===============================================================
+ R1_REMOTE = os.getenv("R1_REMOTE", "0") == "1"
+ R1_MODEL_ID = os.getenv("R1_MODEL_ID", "Mouhib007/DeepSeek-r1-Medical-Mini")
+ HF_TOKEN = os.getenv("HF_TOKEN")  # public model -> may be None
+
+ # ---- Local GGUF (fallback / offline mode) ----
+ GGUF_CANDIDATES = []
+ ENV_REPO = os.getenv("GGUF_REPO", "").strip()
+ ENV_FILE = os.getenv("GGUF_FILE", "").strip()
+ if ENV_REPO and ENV_FILE:
+     GGUF_CANDIDATES.append((ENV_REPO, ENV_FILE))
+ # Default candidate (adjust it if you use a different one)
+ GGUF_CANDIDATES.append((
+     "mradermacher/DeepSeek-r1-Medical-Mini-GGUF",
+     "DeepSeek-r1-Medical-Mini.f16.gguf"
+ ))
+
+ N_CTX = int(os.getenv("N_CTX", "2048"))
+ N_THREADS = int(os.getenv("N_THREADS", str(os.cpu_count() or 4)))
+ N_GPU_LAYERS = int(os.getenv("N_GPU_LAYERS", "0"))
+ N_BATCH = int(os.getenv("N_BATCH", "96"))

+ # ---- Remote client (HF Inference) ----
+ _remote_client = None
+ def get_remote_client():
+     global _remote_client
+     if _remote_client is None:
+         _remote_client = InferenceClient(model=R1_MODEL_ID, token=HF_TOKEN, timeout=60)
+     return _remote_client
+
+ # ---- ChatML formatting (compatible with DeepSeek/Qwen) ----
+ def _format_chatml(messages):
+     parts = []
+     for m in messages:
+         role = m.get("role", "user")
+         content = m.get("content", "")
+         parts.append(f"<|im_start|>{role}\n{content}<|im_end|>\n")
+     parts.append("<|im_start|>assistant\n")
+     return "".join(parts)
+
+ def r1_chat(messages, temperature=0.2, max_tokens=384):
+     """Remote (HF) or local (llama-cpp) backend for DeepSeek-R1 Medical Mini."""
+     if R1_REMOTE:
+         client = get_remote_client()
+         try:
+             # Some endpoints support chat_completion
+             resp = client.chat_completion(messages=messages, temperature=temperature, max_tokens=max_tokens)
+             return resp.choices[0].message["content"]
+         except Exception:
+             # Universal fallback: text_generation with a ChatML prompt
+             try:
+                 prompt = _format_chatml(messages)
+                 return client.text_generation(
+                     prompt,
+                     max_new_tokens=max_tokens,
+                     temperature=temperature,
+                     stop_sequences=["<|im_end|>"],
+                     stream=False,
+                 )
+             except Exception:
+                 # If remote fails (401/429/etc.), fall through to local GGUF if available
+                 pass
+     # Local GGUF
+     llm = get_llm()
+     out = llm.create_chat_completion(messages=messages, temperature=temperature, max_tokens=max_tokens)
+     return out["choices"][0]["message"]["content"]
+
+ # ---- Local loader (GGUF) ----
+ _llm = None
+ def _download_gguf():
+     last_err = None
+     for repo, fname in GGUF_CANDIDATES:
+         try:
+             return hf_hub_download(repo_id=repo, filename=fname), repo, fname
+         except Exception as e:
+             last_err = e
+     raise RuntimeError(f"Could not download any GGUF file. Last error: {last_err}")
+
+ def get_llm():
+     global _llm
+     if _llm is not None:
+         return _llm
+     gguf_path, _, _ = _download_gguf()
+     _llm = Llama(
+         model_path=gguf_path,
+         # Don't force chat_format; use the one embedded in the R1 GGUF
+         n_ctx=N_CTX,
+         n_threads=N_THREADS,
+         n_gpu_layers=N_GPU_LAYERS,
+         n_batch=N_BATCH,
+         verbose=False,
+     )
+     return _llm

+ # Optional warmup (so the first message does not block when running locally)
+ if os.getenv("WARMUP", "0") == "1" and not R1_REMOTE:
+     try:
+         get_llm()
+     except Exception:
+         pass
+
+ # ===============================================================
+ # DeepSeek-OCR (unchanged, with a fallback when FlashAttention2 is missing)
+ # ===============================================================
+ def _best_dtype():
+     if torch.cuda.is_available():
+         return torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
+     return torch.float32
+
+ def _load_ocr_model():
+     model_name = "deepseek-ai/DeepSeek-OCR"
+     ocr_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+     attn_impl = os.getenv("OCR_ATTN_IMPL", "flash_attention_2")  # same default as before
+     try:
+         ocr_model = AutoModel.from_pretrained(
+             model_name,
+             _attn_implementation=attn_impl,
+             trust_remote_code=True,
+             use_safetensors=True,
+         ).eval()
+         return ocr_tokenizer, ocr_model
+     except Exception as e:
+         # If it failed on FlashAttention2, retry in "eager" mode (CPU/compat)
+         msg = str(e)
+         if "flash_attn" in msg or "FlashAttention2" in msg or "flash_attention_2" in msg:
+             ocr_model = AutoModel.from_pretrained(
+                 model_name,
+                 _attn_implementation="eager",
+                 trust_remote_code=True,
+                 use_safetensors=True,
+             ).eval()
+             return ocr_tokenizer, ocr_model
+         raise
+
+ tokenizer, model = _load_ocr_model()

  @spaces.GPU
  def process_image(image, model_size, task_type, is_eval_mode):
      """
+     Returns: annotated image, markdown, and plain text (or markdown when no text is available).
      """
      if image is None:
          return None, "Please upload an image first.", "Please upload an image first."
+     dtype = _best_dtype()
+     model_device = model.cuda().to(dtype) if torch.cuda.is_available() else model.to(dtype)

      with tempfile.TemporaryDirectory() as output_path:
          if task_type == "Free OCR":
              prompt = "<image>\nFree OCR. "
          elif task_type == "Convert to Markdown":
          else:
              prompt = "<image>\nFree OCR. "

          temp_image_path = os.path.join(output_path, "temp_image.jpg")
          image.save(temp_image_path)

          size_configs = {
              "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
              "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
              "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
              "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
+             "Gundam (Recommended)": {"base_size": 1024, "image_size": 640, "crop_mode": True},
          }
          config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])

+         plain_text_result = model_device.infer(
              tokenizer,
              prompt=prompt,
              image_file=temp_image_path,
              base_size=config["base_size"],
              image_size=config["image_size"],
              crop_mode=config["crop_mode"],
+             save_results=True,
              test_compress=True,
              eval_mode=is_eval_mode,
          )

          image_result_path = os.path.join(output_path, "result_with_boxes.jpg")
          markdown_result_path = os.path.join(output_path, "result.mmd")

          if os.path.exists(markdown_result_path):
              with open(markdown_result_path, "r", encoding="utf-8") as f:
                  markdown_content = f.read()
          else:
              markdown_content = "Markdown result was not generated. This is expected for 'Free OCR' task."

          result_image = None
          if os.path.exists(image_result_path):
              result_image = Image.open(image_result_path)
              result_image.load()

          text_result = plain_text_result if plain_text_result else markdown_content
          return result_image, markdown_content, text_result

+ # ===============================================================
+ # Chat (injects the OCR into the first system message, via R1)
+ # ===============================================================
+ def _truncate(text, max_chars=3000):
+     return (text or "")[:max_chars]
+
+ def _system_prompt():
+     return (
+         "You are an educational clinical assistant. You do not replace medical judgment. "
+         "Use CONTEXTO_OCR when present; if it is missing, ask for it. Avoid definitive diagnoses."
+     )
+
+ def _ocr_context(ocr_md, ocr_txt):
+     return _truncate(ocr_md) or _truncate(ocr_txt) or ""
+
+ def to_chat_messages(chat_msgs, ocr_md, ocr_txt):
+     sys = _system_prompt()
+     ctx = _ocr_context(ocr_md, ocr_txt)
+     if ctx:
+         sys += (
+             "\n\n---\n"
+             "CONTEXTO_OCR (primary source; if a datum is missing, say so explicitly):\n"
+             f"{ctx}\n---"
+         )
+     msgs = [{"role": "system", "content": sys}]
+     for m in (chat_msgs or []):
+         if m.get("role") in ("user", "assistant"):
+             msgs.append({"role": m["role"], "content": m.get("content", "")})
+     return msgs
+
+ def r1_reply(user_msg, chat_msgs, ocr_md, ocr_txt):
+     if not user_msg:
+         user_msg = "Analyze the CONTEXTO_OCR above and answer based on that content."
+     try:
+         msgs = to_chat_messages(chat_msgs, ocr_md, ocr_txt) + [{"role": "user", "content": user_msg}]
+         answer = r1_chat(msgs, temperature=0.2, max_tokens=512)
+         updated = (chat_msgs or []) + [
+             {"role": "user", "content": user_msg},
+             {"role": "assistant", "content": answer},
+         ]
+         return updated, "", gr.update(value="")
+     except Exception as e:
+         err = f"{e.__class__.__name__}: {str(e) or repr(e)}"
+         tb = traceback.format_exc(limit=2)
+         updated = (chat_msgs or []) + [
+             {"role": "user", "content": user_msg or ""},
+             {"role": "assistant", "content": f"⚠️ LLM error: {err}"},
+         ]
+         return updated, "", gr.update(value=f"{err}\n{tb}")
+
+ def clear_chat():
+     return [], "", gr.update(value="")

+ # ===============================================================
+ # UI (Gradio 5)
+ # ===============================================================
+ with gr.Blocks(title="DeepSeek-OCR + DeepSeek-R1 Medical Mini", theme=gr.themes.Soft()) as demo:
      gr.Markdown(
          """
+         # DeepSeek-OCR → Medical Chat with **DeepSeek-R1 Medical Mini** (remote HF or local GGUF)
+         1) **Upload an image** and run **OCR** (annotated image, Markdown, and text).
+         2) **Chat** with **DeepSeek-R1 Medical Mini**, which automatically uses the **OCR** output as context.
+         *Educational use; not a replacement for medical advice.*
          """
      )

+     ocr_md_state = gr.State("")
+     ocr_txt_state = gr.State("")
+
      with gr.Row():
          with gr.Column(scale=1):
+             image_input = gr.Image(type="pil", label="Upload Image", sources=["upload", "clipboard", "webcam"])
              model_size = gr.Dropdown(
                  choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
+                 value="Gundam (Recommended)", label="Model Size",
              )
              task_type = gr.Dropdown(
                  choices=["Free OCR", "Convert to Markdown"],
+                 value="Convert to Markdown", label="Task Type",
              )
              eval_mode_checkbox = gr.Checkbox(
+                 value=False, label="Enable Evaluation Mode",
+                 info="Plain text only (faster). Uncheck to get the annotated image and markdown.",
              )
              submit_btn = gr.Button("Process Image", variant="primary")

          with gr.Column(scale=2):
              with gr.Tabs():
+                 with gr.TabItem("Annotated Image"): output_image = gr.Image(interactive=False)
+                 with gr.TabItem("Markdown Preview"): output_markdown = gr.Markdown()
+                 with gr.TabItem("Markdown Source (or Eval Output)"):
+                     output_text = gr.Textbox(lines=18, show_copy_button=True, interactive=False)
+             with gr.Row():
+                 md_preview = gr.Textbox(label="OCR Markdown Snapshot", lines=10, interactive=False)
+                 txt_preview = gr.Textbox(label="OCR Text Snapshot", lines=10, interactive=False)

+     gr.Markdown("## Clinical Chat (DeepSeek-R1 Medical Mini)")
+     with gr.Row():
+         with gr.Column(scale=2):
+             chatbot = gr.Chatbot(label="OCR Assistant (R1 Medical Mini)", type="messages", height=420)
+             user_in = gr.Textbox(label="Message", placeholder="Type your question… (empty = analyze just the OCR)", lines=2)
+             with gr.Row():
+                 send_btn = gr.Button("Send", variant="primary")
+                 clear_btn = gr.Button("Clear")
+         with gr.Column(scale=1):
+             error_box = gr.Textbox(label="Debug (shown on error)", lines=8, interactive=False)
+
+     # OCR → outputs and state
      submit_btn.click(
          fn=process_image,
          inputs=[image_input, model_size, task_type, eval_mode_checkbox],
          outputs=[output_image, output_markdown, output_text],
+     ).then(
+         fn=lambda md, tx: (md, tx, md, tx),
+         inputs=[output_markdown, output_text],
+         outputs=[ocr_md_state, ocr_txt_state, md_preview, txt_preview],
+     )
+
+     # Chat
+     send_btn.click(
+         fn=r1_reply,
+         inputs=[user_in, chatbot, ocr_md_state, ocr_txt_state],
+         outputs=[chatbot, user_in, error_box],
      )
+     clear_btn.click(fn=clear_chat, outputs=[chatbot, user_in, error_box])

  if __name__ == "__main__":
      demo.queue(max_size=20)
      demo.launch()
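The chat path added above can also be driven without the UI; a minimal sketch, assuming the new `app.py` is importable as a module (importing it also loads the DeepSeek-OCR model, so it is heavyweight) and that either the remote endpoint or a local GGUF is reachable; the OCR snippet is illustrative:

```python
# Hedged sketch: exercise the OCR-context chat path without Gradio.
# Note: importing app also downloads/loads the DeepSeek-OCR model.
import app

ocr_md = "| Test | Result |\n| --- | --- |\n| Hemoglobin | 9.8 g/dL |"  # illustrative OCR output
msgs = app.to_chat_messages([], ocr_md, "")  # system prompt + CONTEXTO_OCR block
msgs.append({"role": "user", "content": "What stands out in these results?"})
print(app.r1_chat(msgs, temperature=0.2, max_tokens=256))
```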
requirements.txt CHANGED
@@ -1,15 +1,33 @@
 
 
 
 
 
  torch==2.6.0
  transformers==4.46.3
  tokenizers==0.20.3
- einops
- addict
- easydict
- gradio>=4.0.0
- spaces>=0.20.0
- Pillow>=10.0.0
- safetensors>=0.4.0
- accelerate>=0.24.0
- sentencepiece>=0.1.99
- protobuf>=3.20.0
- torchvision
- flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl

+ # --- Core runtime ---
+ gradio==5.49.1
+ spaces>=0.28.3
+
+ # PyTorch + Transformers
  torch==2.6.0
+ torchvision==0.21.0
  transformers==4.46.3
  tokenizers==0.20.3
+ accelerate>=0.34.2
+ safetensors>=0.4.5
+ huggingface-hub>=0.30.0
+ hf-transfer>=0.1.6
+
+ # Vision / utils
+ pillow>=10.4.0
+ numpy>=1.26.0
+ tqdm>=4.66.4
+ requests>=2.31.0
+ einops>=0.7.0
+ addict>=2.4.0
+ easydict>=1.13
+ sentencepiece>=0.2.0
+ pydantic==2.10.6
+ protobuf<4
+ click<8.1
+
+ # Llama.cpp (local GGUF for the chat when R1_REMOTE=0)
+ llama-cpp-python==0.2.90
+
+ # --- Optional (GPU: speeds up the OCR via flash_attention_2) ---
+ # flash-attn==2.7.3 --no-build-isolation
+ # xformers==0.0.28.post1
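Because `flash-attn` is now optional, it can help to check which attention implementation the OCR loader will end up using before installing anything; a small sketch that mirrors the fallback behavior in `app.py` (the probe itself is not part of this commit):

```python
# Hedged sketch: probe the environment the way app.py's fallback behaves.
import importlib.util
import torch

has_flash = importlib.util.find_spec("flash_attn") is not None
attn = "flash_attention_2" if (has_flash and torch.cuda.is_available()) else "eager"
print(f"torch={torch.__version__} cuda={torch.cuda.is_available()} -> _attn_implementation={attn}")
```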