pnnbao-ump committed on
Commit
1fcb13f
·
verified ·
1 Parent(s): c4a9342

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -148
app.py CHANGED
@@ -10,56 +10,48 @@ from PIL import Image, ImageOps
10
  import fitz
11
  import re
12
  import time
13
- from threading import Thread
14
- from queue import Queue
15
  from io import StringIO, BytesIO
16
  import spaces
17
 
18
- # ==================== DEEPSEEK OCR SETUP ====================
19
  OCR_MODEL_NAME = 'deepseek-ai/DeepSeek-OCR'
 
 
 
 
20
 
 
21
  print("🔄 Loading OCR model...")
22
  ocr_tokenizer = AutoTokenizer.from_pretrained(OCR_MODEL_NAME, trust_remote_code=True)
23
-
24
  try:
25
  ocr_model = AutoModel.from_pretrained(
26
- OCR_MODEL_NAME,
27
- attn_implementation='flash_attention_2',
28
- torch_dtype=torch.bfloat16,
29
- trust_remote_code=True,
30
  use_safetensors=True
31
  )
32
  print("✅ Using Flash Attention 2")
33
  except (ImportError, ValueError):
34
  print("⚠️ Flash Attention 2 not available, using eager attention")
35
  ocr_model = AutoModel.from_pretrained(
36
- OCR_MODEL_NAME,
37
- attn_implementation='eager',
38
- torch_dtype=torch.bfloat16,
39
- trust_remote_code=True,
40
  use_safetensors=True
41
  )
42
-
43
- # Don't move model to GPU here - let @spaces.GPU handle it
44
  ocr_model = ocr_model.eval()
45
 
46
- MODEL_CONFIGS = {
47
- "Crab": {"base_size": 1024, "image_size": 640, "crop_mode": True},
48
- "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
49
- }
50
-
51
- # ==================== MEDCRAB TRANSLATOR SETUP ====================
52
  print("🦀 Loading MedCrab translator...")
53
  device = "cuda" if torch.cuda.is_available() else "cpu"
54
  translator = MedCrabTranslator(device=device)
55
  print(f"✅ MedCrab translator loaded on {device}")
56
 
57
- # ==================== TEXT CLEANING FUNCTIONS ====================
58
  def clean_mathrm(text):
59
- """Chuyển đổi LaTeX sang HTML với subscript/superscript chỉ trong môi trường toán học"""
60
  if not text:
61
  return ""
62
-
63
  def process_math_block(match):
64
  math_content = match.group(1)
65
  math_content = re.sub(r'\\mathrm\{([^}]*)\}', r'\1', math_content)
@@ -67,7 +59,6 @@ def clean_mathrm(text):
67
  math_content = re.sub(r'\^([A-Za-z0-9+\-]+)', r'<sup>\1</sup>', math_content)
68
  math_content = re.sub(r'_\{([^}]+)\}', r'<sub>\1</sub>', math_content)
69
  math_content = re.sub(r'_([A-Za-z0-9+\-]+)', r'<sub>\1</sub>', math_content)
70
-
71
  replacements = {
72
  r'\times': '×', r'\pm': '±', r'\div': '÷', r'\cdot': '·',
73
  r'\approx': '≈', r'\leq': '≤', r'\geq': '≥', r'\neq': '≠',
@@ -76,11 +67,9 @@ def clean_mathrm(text):
76
  }
77
  for latex_cmd, unicode_char in replacements.items():
78
  math_content = math_content.replace(latex_cmd, unicode_char)
79
-
80
  return math_content
81
-
82
  text = re.sub(r'\\\((.+?)\\\)', process_math_block, text, flags=re.DOTALL)
83
-
84
  def process_bracket_block(m):
85
  class FakeMatch:
86
  def __init__(self, content):
@@ -89,16 +78,12 @@ def clean_mathrm(text):
89
  return self.content
90
  content = process_math_block(FakeMatch(m.group(1)))
91
  return '[' + content + ']'
92
-
93
  text = re.sub(r'\\\[(.+?)\\\]', process_bracket_block, text, flags=re.DOTALL)
94
  text = re.sub(r'\\mathrm\{([^}]*)\}', r'\1', text)
95
  text = text.replace(r'\%', '%')
96
-
97
  lines = text.split('\n')
98
  cleaned_lines = [re.sub(r'[ \t]+', ' ', line).strip() for line in lines]
99
- text = '\n'.join(cleaned_lines)
100
-
101
- return text.strip()
102
 
103
  def clean_output(text, include_images=False, remove_labels=False):
104
  if not text:
@@ -106,7 +91,6 @@ def clean_output(text, include_images=False, remove_labels=False):
106
  pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
107
  matches = re.findall(pattern, text, re.DOTALL)
108
  img_num = 0
109
-
110
  for match in matches:
111
  if '<|ref|>image<|/ref|>' in match[0]:
112
  if include_images:
@@ -119,58 +103,48 @@ def clean_output(text, include_images=False, remove_labels=False):
119
  text = text.replace(match[0], '', 1)
120
  else:
121
  text = text.replace(match[0], match[1], 1)
122
-
123
- text = clean_mathrm(text)
124
- return text.strip()
125
 
126
- # ==================== OCR FUNCTIONS ====================
127
  @spaces.GPU
128
  def ocr_process_image(image, mode="Crab"):
129
  if image is None:
130
  return "Error: Upload image"
131
-
132
- # Move model to GPU inside the @spaces.GPU decorated function
133
  device = "cuda" if torch.cuda.is_available() else "cpu"
134
  ocr_model.to(device)
135
-
136
  if image.mode in ('RGBA', 'LA', 'P'):
137
  image = image.convert('RGB')
138
  image = ImageOps.exif_transpose(image)
139
-
140
  config = MODEL_CONFIGS[mode]
141
  prompt = "<image>\n<|grounding|>Convert the document to markdown."
142
-
143
  tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
144
  image.save(tmp.name, 'JPEG', quality=95)
145
  tmp.close()
146
  out_dir = tempfile.mkdtemp()
147
-
148
  stdout = sys.stdout
149
  sys.stdout = StringIO()
150
-
151
  try:
152
  ocr_model.infer(
153
- tokenizer=ocr_tokenizer,
154
- prompt=prompt,
155
- image_file=tmp.name,
156
  output_path=out_dir,
157
- base_size=config["base_size"],
158
- image_size=config["image_size"],
159
  crop_mode=config["crop_mode"]
160
  )
161
-
162
- result = '\n'.join([l for l in sys.stdout.getvalue().split('\n')
163
  if not any(s in l for s in ['image:', 'other:', 'PATCHES', '====', 'BASE:', '%|', 'torch.Size'])]).strip()
164
  finally:
165
  sys.stdout = stdout
166
- os.unlink(tmp.name)
 
 
 
167
  shutil.rmtree(out_dir, ignore_errors=True)
168
-
169
  if not result:
170
  return "No text detected"
171
-
172
- markdown = clean_output(result, True, True)
173
- return markdown
174
 
175
  def ocr_process_pdf(path, mode, page_num):
176
  doc = fitz.open(path)
@@ -178,12 +152,10 @@ def ocr_process_pdf(path, mode, page_num):
178
  if page_num < 1 or page_num > total_pages:
179
  doc.close()
180
  return f"Invalid page number. PDF has {total_pages} pages."
181
-
182
  page = doc.load_page(page_num - 1)
183
  pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72), alpha=False)
184
  img = Image.open(BytesIO(pix.tobytes("png")))
185
  doc.close()
186
-
187
  return ocr_process_image(img, mode)
188
 
189
  def ocr_process_file(path, mode, page_num):
@@ -194,58 +166,46 @@ def ocr_process_file(path, mode, page_num):
194
  else:
195
  return ocr_process_image(Image.open(path), mode)
196
 
197
- # ==================== TRANSLATION FUNCTIONS ====================
198
  def split_by_sentences(text: str, max_words: int = 100):
199
  def count_words(t):
200
  return len(t.strip().split())
201
-
202
  chunks = []
203
  lines = text.split('\n')
204
-
205
  i = 0
206
  while i < len(lines):
207
  line = lines[i]
208
-
209
  empty_count = 0
210
  if not line.strip():
211
  while i < len(lines) and not lines[i].strip():
212
  empty_count += 1
213
  i += 1
214
-
215
  if chunks:
216
  prev_text, prev_newlines = chunks[-1]
217
  chunks[-1] = (prev_text, prev_newlines + empty_count)
218
  continue
219
-
220
  line = line.strip()
221
  is_last_line = (i == len(lines) - 1)
222
-
223
  if count_words(line) <= max_words:
224
  chunks.append((line, 0 if is_last_line else 1))
225
  i += 1
226
  continue
227
-
228
  sentences = re.split(r'(?<=[.!?])\s+', line)
229
  current_chunk = ""
230
  current_words = 0
231
-
232
- for sent_idx, sentence in enumerate(sentences):
233
  sentence = sentence.strip()
234
  if not sentence:
235
  continue
236
-
237
  sentence_words = count_words(sentence)
238
-
239
  if sentence_words > max_words:
240
  if current_chunk:
241
  chunks.append((current_chunk.strip(), 0))
242
  current_chunk = ""
243
  current_words = 0
244
-
245
  sub_parts = re.split(r',\s*', sentence)
246
  temp_chunk = ""
247
  temp_words = 0
248
-
249
  for part in sub_parts:
250
  part_words = count_words(part)
251
  if temp_words + part_words > max_words and temp_chunk:
@@ -258,11 +218,9 @@ def split_by_sentences(text: str, max_words: int = 100):
258
  else:
259
  temp_chunk = part
260
  temp_words += part_words
261
-
262
  if temp_chunk.strip():
263
  current_chunk = temp_chunk.strip()
264
  current_words = temp_words
265
-
266
  elif current_words + sentence_words <= max_words:
267
  if current_chunk:
268
  current_chunk += " " + sentence
@@ -273,18 +231,14 @@ def split_by_sentences(text: str, max_words: int = 100):
273
  chunks.append((current_chunk.strip(), 0))
274
  current_chunk = sentence
275
  current_words = sentence_words
276
-
277
  if current_chunk.strip():
278
  chunks.append((current_chunk.strip(), 0 if is_last_line else 1))
279
-
280
  i += 1
281
-
282
  return chunks
283
 
284
  @spaces.GPU
285
  def translate_chunk(chunk_text):
286
  device = "cuda" if torch.cuda.is_available() else "cpu"
287
- # Ensure translator is on correct device
288
  if hasattr(translator, 'model') and hasattr(translator.model, 'to'):
289
  translator.model.to(device)
290
  return translator.translate(chunk_text, max_new_tokens=2048).strip()
@@ -293,37 +247,31 @@ def streaming_translate(text: str):
293
  if not text or not text.strip():
294
  yield '<div style="padding:20px; color:#ff6b6b;">⚠️ Vui lòng nhập văn bản tiếng Anh để dịch.</div>'
295
  return
296
-
297
  chunks = split_by_sentences(text, max_words=100)
298
  accumulated = ""
299
-
300
  for i, (chunk_text, newline_count) in enumerate(chunks):
301
  try:
302
  translated = translate_chunk(chunk_text)
303
-
304
  if accumulated and not accumulated.endswith('\n'):
305
  accumulated += " " + translated
306
  else:
307
  accumulated += translated
308
-
309
  chunk_start = len(accumulated) - len(translated)
310
  for j in range(len(translated)):
311
  current_display = accumulated[:chunk_start + j + 1]
312
  html_output = f'<div style="padding:20px; line-height:1.8; font-size:15px; white-space:pre-wrap; font-family:Arial,sans-serif;">{current_display}</div>'
313
  yield html_output
314
  time.sleep(0.015)
315
-
316
  if newline_count > 0:
317
  actual_newlines = min(newline_count, 2)
318
  accumulated += "\n" * actual_newlines
319
  html_output = f'<div style="padding:20px; line-height:1.8; font-size:15px; white-space:pre-wrap; font-family:Arial,sans-serif;">{accumulated}</div>'
320
  yield html_output
321
-
322
  except Exception as e:
323
  yield f'<div style="padding:20px; color:#ff6b6b;">❌ Lỗi dịch chunk {i+1}: {str(e)}</div>'
324
  return
325
 
326
- # ==================== UI HELPER FUNCTIONS ====================
327
  def load_image(file_path, page_num_str="1"):
328
  if not file_path:
329
  return None
@@ -332,7 +280,6 @@ def load_image(file_path, page_num_str="1"):
332
  page_num = int(page_num_str)
333
  except (ValueError, TypeError):
334
  page_num = 1
335
-
336
  if file_path.lower().endswith('.pdf'):
337
  doc = fitz.open(file_path)
338
  page_idx = max(0, min(page_num - 1, len(doc) - 1))
@@ -375,75 +322,66 @@ def update_page_info(file_path):
375
 
376
  # ==================== COMBINED OCR + TRANSLATION ====================
377
  def ocr_and_translate_streaming(file_path, mode, page_num_str):
378
- """Hàm kết hợp: OCR trước, sau đó dịch streaming"""
379
  if not file_path:
380
  yield '<div style="padding:20px; color:#ff6b6b;">⚠️ Vui lòng tải file lên trước!</div>'
381
  return
382
-
383
  yield '<div style="padding:20px; color:#4CAF50;">🔍 Đang quét OCR...</div>'
384
  try:
385
  try:
386
  page_num = int(page_num_str)
387
  except (ValueError, TypeError):
388
  page_num = 1
389
-
390
  markdown = ocr_process_file(file_path, mode, page_num)
391
-
392
  if not markdown or markdown.startswith("Error") or markdown.startswith("Invalid"):
393
  yield f'<div style="padding:20px; color:#ff6b6b;">❌ Lỗi OCR: {markdown}</div>'
394
  return
395
-
396
  except Exception as e:
397
  yield f'<div style="padding:20px; color:#ff6b6b;">❌ Lỗi OCR: {str(e)}</div>'
398
  return
399
-
400
  yield '<div style="padding:20px; color:#2196F3;">🦀 Đang dịch...</div>'
401
  time.sleep(0.5)
402
-
403
  try:
404
  yield from streaming_translate(markdown)
405
  except Exception as e:
406
  yield f'<div style="padding:20px; color:#ff6b6b;">❌ Lỗi dịch: {str(e)}</div>'
407
 
408
  # ==================== GRADIO INTERFACE ====================
 
 
 
 
 
 
 
 
 
 
 
 
 
409
 
410
  with gr.Blocks(theme=gr.themes.Soft(), title="MedCrab Translation") as demo:
411
-
412
  gr.Markdown("""
413
  <div style="text-align: center;">
414
  <h1>🦀 MedCrab Translation</h1>
415
  <p style="font-size: 18px;"><b>Quét PDF Y khoa → Dịch trực tiếp sang tiếng Việt (Streaming)</b></p>
416
- <p style="font-size: 14px; color: #666;">
417
- Model: <a href="https://huggingface.co/pnnbao-ump/MedCrab-1.5B" target="_blank">MedCrab-1.5B</a>
418
- | Repo: <a href="https://github.com/pnnbao97/MedCrab" target="_blank">GitHub</a>
419
- | Tác giả: <b>Phạm Nguyễn Ngọc Bảo</b>
420
- </p>
421
- <p style="font-size: 13px; color: #ff9800;">
422
- 🚀 <b>Coming Soon:</b> MedCrab-8B sẽ ra mắt trong vài tuần tới!
423
- </p>
424
  </div>
425
  """)
426
-
427
  with gr.Row():
428
  with gr.Column(scale=1):
429
  gr.Markdown("### 📤 Tải file lên")
430
  file_in = gr.File(label="PDF hoặc Hình ảnh", file_types=["image", ".pdf"], type="filepath")
431
  input_img = gr.Image(label="Xem trước", type="pil", height=300)
432
-
433
- page_input = gr.Textbox(
434
- label="Số trang (chỉ dùng cho PDF, mặc định: 1)",
435
- value="1",
436
- placeholder="Nhập số trang..."
437
- )
438
  mode = gr.Dropdown(list(MODEL_CONFIGS.keys()), value="Crab", label="Chế độ OCR")
439
-
440
  gr.Markdown("### 🦀 Quét và Dịch")
441
  process_btn = gr.Button("🚀 Quét OCR + Dịch tiếng Việt", variant="primary", size="lg")
442
-
443
  with gr.Column(scale=2):
444
  gr.Markdown("### 📄 Kết quả dịch tiếng Việt (Streaming)")
445
  translation_output = gr.HTML(label="", value="")
446
-
447
  with gr.Accordion("📚 Ví dụ mẫu", open=True):
448
  gr.Markdown("**Thử ngay với các ví dụ có sẵn:**")
449
  gr.Examples(
@@ -457,49 +395,24 @@ with gr.Blocks(theme=gr.themes.Soft(), title="MedCrab Translation") as demo:
457
  cache_examples=False,
458
  label="Nhấp vào ví dụ để thử"
459
  )
460
-
461
  with gr.Accordion("⚖️ Giấy phép & Liên hệ", open=False):
462
  gr.Markdown("""
463
- **Giấy phép:** CC BY-NC 4.0 (Creative Commons Attribution-NonCommercial 4.0 International)
464
-
465
- ✅ **Được phép:**
466
- - Sử dụng cá nhân
467
- - Nghiên cứu học thuật
468
- - Giáo dục
469
-
470
- ❌ **Không được phép:**
471
- - Sử dụng thương mại
472
- - Triển khai tại bệnh viện/phòng khám mà không có giấy phép
473
-
474
- **💼 Nhu cầu thương mại:**
475
- Nếu bạn đại diện cho bệnh viện, phòng khám hoặc tổ chức y tế muốn sử dụng MedCrab cho mục đích thương mại,
476
- vui lòng liên hệ trực tiếp tác giả:
477
-
478
- 👤 **Phạm Nguyễn Ngọc Bảo**
479
- 📧 Facebook: [facebook.com/bao.phamnguyenngoc.5](https://www.facebook.com/bao.phamnguyenngoc.5/)
480
  """)
481
-
482
  file_in.change(load_image, [file_in, page_input], [input_img])
483
  file_in.change(update_page_info, [file_in], [page_input])
484
  page_input.change(load_image, [file_in, page_input], [input_img])
485
-
486
- process_btn.click(
487
- ocr_and_translate_streaming,
488
- [file_in, mode, page_input],
489
- [translation_output]
490
- )
491
 
492
- def load_default_example():
493
- file_path = "images/example1.png"
494
- img = Image.open(file_path)
495
- return file_path, img
496
-
497
- demo.load(
498
- load_default_example,
499
- inputs=None,
500
- outputs=[file_in, input_img] # cập nhật cả file_in và input_img
501
- )
502
 
503
  if __name__ == "__main__":
504
  print("🚀 Starting MedCrab Translation on Hugging Face Spaces...")
505
- demo.queue(max_size=20).launch()
 
10
  import fitz
11
  import re
12
  import time
 
 
13
  from io import StringIO, BytesIO
14
  import spaces
15
 
16
+ # ==================== CONFIG ====================
17
  OCR_MODEL_NAME = 'deepseek-ai/DeepSeek-OCR'
18
+ MODEL_CONFIGS = {
19
+ "Crab": {"base_size": 1024, "image_size": 640, "crop_mode": True},
20
+ "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
21
+ }
22
 
23
+ # ==================== LOAD MODELS ====================
24
  print("🔄 Loading OCR model...")
25
  ocr_tokenizer = AutoTokenizer.from_pretrained(OCR_MODEL_NAME, trust_remote_code=True)
 
26
  try:
27
  ocr_model = AutoModel.from_pretrained(
28
+ OCR_MODEL_NAME,
29
+ attn_implementation='flash_attention_2',
30
+ torch_dtype=torch.bfloat16,
31
+ trust_remote_code=True,
32
  use_safetensors=True
33
  )
34
  print("✅ Using Flash Attention 2")
35
  except (ImportError, ValueError):
36
  print("⚠️ Flash Attention 2 not available, using eager attention")
37
  ocr_model = AutoModel.from_pretrained(
38
+ OCR_MODEL_NAME,
39
+ attn_implementation='eager',
40
+ torch_dtype=torch.bfloat16,
41
+ trust_remote_code=True,
42
  use_safetensors=True
43
  )
 
 
44
  ocr_model = ocr_model.eval()
45
 
 
 
 
 
 
 
46
  print("🦀 Loading MedCrab translator...")
47
  device = "cuda" if torch.cuda.is_available() else "cpu"
48
  translator = MedCrabTranslator(device=device)
49
  print(f"✅ MedCrab translator loaded on {device}")
50
 
51
+ # ==================== TEXT CLEANING ====================
52
  def clean_mathrm(text):
 
53
  if not text:
54
  return ""
 
55
  def process_math_block(match):
56
  math_content = match.group(1)
57
  math_content = re.sub(r'\\mathrm\{([^}]*)\}', r'\1', math_content)
 
59
  math_content = re.sub(r'\^([A-Za-z0-9+\-]+)', r'<sup>\1</sup>', math_content)
60
  math_content = re.sub(r'_\{([^}]+)\}', r'<sub>\1</sub>', math_content)
61
  math_content = re.sub(r'_([A-Za-z0-9+\-]+)', r'<sub>\1</sub>', math_content)
 
62
  replacements = {
63
  r'\times': '×', r'\pm': '±', r'\div': '÷', r'\cdot': '·',
64
  r'\approx': '≈', r'\leq': '≤', r'\geq': '≥', r'\neq': '≠',
 
67
  }
68
  for latex_cmd, unicode_char in replacements.items():
69
  math_content = math_content.replace(latex_cmd, unicode_char)
 
70
  return math_content
71
+
72
  text = re.sub(r'\\\((.+?)\\\)', process_math_block, text, flags=re.DOTALL)
 
73
  def process_bracket_block(m):
74
  class FakeMatch:
75
  def __init__(self, content):
 
78
  return self.content
79
  content = process_math_block(FakeMatch(m.group(1)))
80
  return '[' + content + ']'
 
81
  text = re.sub(r'\\\[(.+?)\\\]', process_bracket_block, text, flags=re.DOTALL)
82
  text = re.sub(r'\\mathrm\{([^}]*)\}', r'\1', text)
83
  text = text.replace(r'\%', '%')
 
84
  lines = text.split('\n')
85
  cleaned_lines = [re.sub(r'[ \t]+', ' ', line).strip() for line in lines]
86
+ return '\n'.join(cleaned_lines).strip()
 
 
87
 
88
  def clean_output(text, include_images=False, remove_labels=False):
89
  if not text:
 
91
  pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
92
  matches = re.findall(pattern, text, re.DOTALL)
93
  img_num = 0
 
94
  for match in matches:
95
  if '<|ref|>image<|/ref|>' in match[0]:
96
  if include_images:
 
103
  text = text.replace(match[0], '', 1)
104
  else:
105
  text = text.replace(match[0], match[1], 1)
106
+ return clean_mathrm(text).strip()
 
 
107
 
108
+ # ==================== OCR HELPERS ====================
@spaces.GPU
def ocr_process_image(image, mode="Crab"):
    """Run the OCR model on a PIL image and return cleaned markdown text.

    The model's ``infer`` call reports its result by printing, so stdout is
    captured while it runs and the progress/debug lines are filtered out.

    Args:
        image: PIL image to recognise, or ``None``.
        mode: Key into ``MODEL_CONFIGS`` selecting ``base_size``,
            ``image_size`` and ``crop_mode`` for inference.

    Returns:
        The cleaned OCR text, ``"No text detected"`` when nothing usable was
        printed, or a message starting with ``"Error"`` when the image is
        missing or ``mode`` is not a known configuration.
    """
    # Local import keeps this block self-contained.
    from contextlib import redirect_stdout

    # Modes Pillow's JPEG writer accepts; anything else (RGBA, LA, P,
    # I;16, F, ...) has to be converted before saving.
    jpeg_modes = ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr')
    # Substrings identifying non-result lines printed during inference.
    noise_markers = ['image:', 'other:', 'PATCHES', '====', 'BASE:', '%|', 'torch.Size']

    if image is None:
        return "Error: Upload image"
    if mode not in MODEL_CONFIGS:
        # Report in the same "Error..." string form as the check above
        # instead of surfacing a bare KeyError.
        return f"Error: Unknown OCR mode {mode!r}"
    config = MODEL_CONFIGS[mode]

    device = "cuda" if torch.cuda.is_available() else "cpu"
    ocr_model.to(device)

    if image.mode not in jpeg_modes:
        image = image.convert('RGB')
    image = ImageOps.exif_transpose(image)

    prompt = "<image>\n<|grounding|>Convert the document to markdown."

    # One scratch directory holds both the input JPEG and the model's output
    # directory, so a single rmtree cleans up everything even when saving
    # the image or running inference fails.
    work_dir = tempfile.mkdtemp()
    try:
        image_path = os.path.join(work_dir, 'input.jpg')
        out_dir = os.path.join(work_dir, 'out')
        os.makedirs(out_dir)
        image.save(image_path, 'JPEG', quality=95)

        captured = StringIO()
        with redirect_stdout(captured):
            ocr_model.infer(
                tokenizer=ocr_tokenizer,
                prompt=prompt,
                image_file=image_path,
                output_path=out_dir,
                base_size=config["base_size"],
                image_size=config["image_size"],
                crop_mode=config["crop_mode"]
            )
    finally:
        shutil.rmtree(work_dir, ignore_errors=True)

    result = '\n'.join(
        line for line in captured.getvalue().split('\n')
        if not any(marker in line for marker in noise_markers)
    ).strip()

    if not result:
        return "No text detected"
    return clean_output(result, True, True)
 
 
148
 
149
  def ocr_process_pdf(path, mode, page_num):
150
  doc = fitz.open(path)
 
152
  if page_num < 1 or page_num > total_pages:
153
  doc.close()
154
  return f"Invalid page number. PDF has {total_pages} pages."
 
155
  page = doc.load_page(page_num - 1)
156
  pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72), alpha=False)
157
  img = Image.open(BytesIO(pix.tobytes("png")))
158
  doc.close()
 
159
  return ocr_process_image(img, mode)
160
 
161
  def ocr_process_file(path, mode, page_num):
 
166
  else:
167
  return ocr_process_image(Image.open(path), mode)
168
 
169
+ # ==================== TRANSLATION HELPERS ====================
170
  def split_by_sentences(text: str, max_words: int = 100):
171
  def count_words(t):
172
  return len(t.strip().split())
 
173
  chunks = []
174
  lines = text.split('\n')
 
175
  i = 0
176
  while i < len(lines):
177
  line = lines[i]
 
178
  empty_count = 0
179
  if not line.strip():
180
  while i < len(lines) and not lines[i].strip():
181
  empty_count += 1
182
  i += 1
 
183
  if chunks:
184
  prev_text, prev_newlines = chunks[-1]
185
  chunks[-1] = (prev_text, prev_newlines + empty_count)
186
  continue
 
187
  line = line.strip()
188
  is_last_line = (i == len(lines) - 1)
 
189
  if count_words(line) <= max_words:
190
  chunks.append((line, 0 if is_last_line else 1))
191
  i += 1
192
  continue
 
193
  sentences = re.split(r'(?<=[.!?])\s+', line)
194
  current_chunk = ""
195
  current_words = 0
196
+ for sentence in sentences:
 
197
  sentence = sentence.strip()
198
  if not sentence:
199
  continue
 
200
  sentence_words = count_words(sentence)
 
201
  if sentence_words > max_words:
202
  if current_chunk:
203
  chunks.append((current_chunk.strip(), 0))
204
  current_chunk = ""
205
  current_words = 0
 
206
  sub_parts = re.split(r',\s*', sentence)
207
  temp_chunk = ""
208
  temp_words = 0
 
209
  for part in sub_parts:
210
  part_words = count_words(part)
211
  if temp_words + part_words > max_words and temp_chunk:
 
218
  else:
219
  temp_chunk = part
220
  temp_words += part_words
 
221
  if temp_chunk.strip():
222
  current_chunk = temp_chunk.strip()
223
  current_words = temp_words
 
224
  elif current_words + sentence_words <= max_words:
225
  if current_chunk:
226
  current_chunk += " " + sentence
 
231
  chunks.append((current_chunk.strip(), 0))
232
  current_chunk = sentence
233
  current_words = sentence_words
 
234
  if current_chunk.strip():
235
  chunks.append((current_chunk.strip(), 0 if is_last_line else 1))
 
236
  i += 1
 
237
  return chunks
238
 
@spaces.GPU
def translate_chunk(chunk_text):
    """Translate a single chunk of text with the module-level translator.

    If the translator exposes a ``model`` attribute that has a ``to`` method,
    that model is first moved to CUDA when available, otherwise to CPU.

    Returns:
        The translator's output with surrounding whitespace stripped.
    """
    target = "cuda" if torch.cuda.is_available() else "cpu"

    # Not every translator object necessarily exposes a movable model.
    model = getattr(translator, 'model', None)
    if hasattr(model, 'to'):
        model.to(target)

    raw_translation = translator.translate(chunk_text, max_new_tokens=2048)
    return raw_translation.strip()
 
247
  if not text or not text.strip():
248
  yield '<div style="padding:20px; color:#ff6b6b;">⚠️ Vui lòng nhập văn bản tiếng Anh để dịch.</div>'
249
  return
 
250
  chunks = split_by_sentences(text, max_words=100)
251
  accumulated = ""
 
252
  for i, (chunk_text, newline_count) in enumerate(chunks):
253
  try:
254
  translated = translate_chunk(chunk_text)
 
255
  if accumulated and not accumulated.endswith('\n'):
256
  accumulated += " " + translated
257
  else:
258
  accumulated += translated
 
259
  chunk_start = len(accumulated) - len(translated)
260
  for j in range(len(translated)):
261
  current_display = accumulated[:chunk_start + j + 1]
262
  html_output = f'<div style="padding:20px; line-height:1.8; font-size:15px; white-space:pre-wrap; font-family:Arial,sans-serif;">{current_display}</div>'
263
  yield html_output
264
  time.sleep(0.015)
 
265
  if newline_count > 0:
266
  actual_newlines = min(newline_count, 2)
267
  accumulated += "\n" * actual_newlines
268
  html_output = f'<div style="padding:20px; line-height:1.8; font-size:15px; white-space:pre-wrap; font-family:Arial,sans-serif;">{accumulated}</div>'
269
  yield html_output
 
270
  except Exception as e:
271
  yield f'<div style="padding:20px; color:#ff6b6b;">❌ Lỗi dịch chunk {i+1}: {str(e)}</div>'
272
  return
273
 
274
+ # ==================== UI HELPERS ====================
275
  def load_image(file_path, page_num_str="1"):
276
  if not file_path:
277
  return None
 
280
  page_num = int(page_num_str)
281
  except (ValueError, TypeError):
282
  page_num = 1
 
283
  if file_path.lower().endswith('.pdf'):
284
  doc = fitz.open(file_path)
285
  page_idx = max(0, min(page_num - 1, len(doc) - 1))
 
322
 
323
  # ==================== COMBINED OCR + TRANSLATION ====================
def ocr_and_translate_streaming(file_path, mode, page_num_str):
    """Yield HTML snippets while OCR-ing a file and then translating the text.

    Args:
        file_path: Path of the uploaded file; a falsy value yields a single
            warning and stops.
        mode: OCR mode name forwarded to ``ocr_process_file``.
        page_num_str: Page number as text; anything that ``int()`` rejects
            falls back to page 1.

    Yields:
        HTML strings: a status line for each phase, the streamed translation
        produced by ``streaming_translate``, or an error box on failure.
    """
    # Local import keeps this block self-contained.
    import html

    def error_box(label, detail):
        # Dynamic text (OCR output, exception messages) may contain '<' or
        # '&'; escape it so it is displayed rather than parsed as markup.
        safe_detail = html.escape(str(detail), quote=False)
        return f'<div style="padding:20px; color:#ff6b6b;">❌ {label}: {safe_detail}</div>'

    if not file_path:
        yield '<div style="padding:20px; color:#ff6b6b;">⚠️ Vui lòng tải file lên trước!</div>'
        return

    yield '<div style="padding:20px; color:#4CAF50;">🔍 Đang quét OCR...</div>'

    try:
        page_num = int(page_num_str)
    except (ValueError, TypeError):
        page_num = 1

    try:
        markdown = ocr_process_file(file_path, mode, page_num)
    except Exception as e:
        yield error_box("Lỗi OCR", e)
        return

    # The OCR helpers signal failure through strings beginning with "Error"
    # or "Invalid".
    # NOTE(review): a document whose recognised text really starts with one
    # of these words is also reported as a failure; fixing that needs a
    # different error channel in the OCR helpers.
    if not markdown or markdown.startswith(("Error", "Invalid")):
        yield error_box("Lỗi OCR", markdown)
        return

    yield '<div style="padding:20px; color:#2196F3;">🦀 Đang dịch...</div>'
    time.sleep(0.5)

    try:
        yield from streaming_translate(markdown)
    except Exception as e:
        yield error_box("Lỗi dịch", e)
347
 
348
  # ==================== GRADIO INTERFACE ====================
def load_default_example():
    """Return ``(file_path, image)`` for the bundled example image.

    The example is copied into the system temporary directory and the path
    of that copy is returned; if the copy fails, the original path is used.
    (Presumably the copy exists so the UI can serve the file from a temp
    location — confirm against the Gradio file-access settings.)

    Returns:
        A ``(path, PIL image)`` tuple, or ``(None, None)`` when the example
        file is missing or cannot be read as an image.
    """
    src = "images/example1.png"
    if not os.path.exists(src):
        return None, None

    # Use the platform temp directory rather than a hard-coded "/tmp".
    example_path = os.path.join(tempfile.gettempdir(), "example1.png")
    try:
        shutil.copy(src, example_path)
    except OSError:
        # Copy failed (permissions, full disk, ...): use the source directly.
        example_path = src

    try:
        img = Image.open(example_path)
        # Image.open is lazy; read the pixels now so a corrupt file is
        # detected here and the underlying file handle is released.
        img.load()
    except OSError:
        return None, None
    return example_path, img
362
 
363
  with gr.Blocks(theme=gr.themes.Soft(), title="MedCrab Translation") as demo:
 
364
  gr.Markdown("""
365
  <div style="text-align: center;">
366
  <h1>🦀 MedCrab Translation</h1>
367
  <p style="font-size: 18px;"><b>Quét PDF Y khoa → Dịch trực tiếp sang tiếng Việt (Streaming)</b></p>
368
+ <p style="font-size: 14px; color: #666;">Model: <a href="https://huggingface.co/pnnbao-ump/MedCrab-1.5B" target="_blank">MedCrab-1.5B</a></p>
 
 
 
 
 
 
 
369
  </div>
370
  """)
371
+
372
  with gr.Row():
373
  with gr.Column(scale=1):
374
  gr.Markdown("### 📤 Tải file lên")
375
  file_in = gr.File(label="PDF hoặc Hình ảnh", file_types=["image", ".pdf"], type="filepath")
376
  input_img = gr.Image(label="Xem trước", type="pil", height=300)
377
+ page_input = gr.Textbox(label="Số trang (chỉ dùng cho PDF, mặc định: 1)", value="1", placeholder="Nhập số trang...")
 
 
 
 
 
378
  mode = gr.Dropdown(list(MODEL_CONFIGS.keys()), value="Crab", label="Chế độ OCR")
 
379
  gr.Markdown("### 🦀 Quét và Dịch")
380
  process_btn = gr.Button("🚀 Quét OCR + Dịch tiếng Việt", variant="primary", size="lg")
 
381
  with gr.Column(scale=2):
382
  gr.Markdown("### 📄 Kết quả dịch tiếng Việt (Streaming)")
383
  translation_output = gr.HTML(label="", value="")
384
+
385
  with gr.Accordion("📚 Ví dụ mẫu", open=True):
386
  gr.Markdown("**Thử ngay với các ví dụ có sẵn:**")
387
  gr.Examples(
 
395
  cache_examples=False,
396
  label="Nhấp vào ví dụ để thử"
397
  )
398
+
399
  with gr.Accordion("⚖️ Giấy phép & Liên hệ", open=False):
400
  gr.Markdown("""
401
+ **Giấy phép:** CC BY-NC 4.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
402
  """)
403
+ # Events
404
  file_in.change(load_image, [file_in, page_input], [input_img])
405
  file_in.change(update_page_info, [file_in], [page_input])
406
  page_input.change(load_image, [file_in, page_input], [input_img])
407
+ process_btn.click(ocr_and_translate_streaming, [file_in, mode, page_input], [translation_output])
 
 
 
 
 
408
 
409
+ # Load default example into both file_in (filepath) and input_img (PIL) when UI starts
410
+ demo.load(
411
+ load_default_example,
412
+ inputs=None,
413
+ outputs=[file_in, input_img]
414
+ )
 
 
 
 
415
 
416
  if __name__ == "__main__":
417
  print("🚀 Starting MedCrab Translation on Hugging Face Spaces...")
418
+ demo.queue(max_size=20).launch()