UnMelow committed on
Commit
b20d7cc
·
verified ·
1 Parent(s): d77255d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +116 -172
app.py CHANGED
@@ -1,47 +1,52 @@
1
  import os
2
  import re
3
- import tempfile
4
  from io import BytesIO
5
- from typing import List, Tuple, Optional
6
 
7
  import gradio as gr
8
  import torch
9
  import numpy as np
10
- from PIL import Image, ImageDraw, ImageFont, ImageOps
11
  import fitz # PyMuPDF
12
 
13
  from transformers import (
14
- AutoProcessor,
15
  VisionEncoderDecoderModel,
16
  BlipProcessor,
17
  BlipForConditionalGeneration,
18
  )
 
19
 
20
  # -------------------------
21
- # CPU-only setup
22
  # -------------------------
 
 
 
23
  DEVICE = torch.device("cpu")
24
  torch.set_num_threads(int(os.getenv("TORCH_NUM_THREADS", "4")))
25
 
26
  TROCR_NAME = os.getenv("TROCR_MODEL", "microsoft/trocr-base-printed")
27
  BLIP_NAME = os.getenv("BLIP_MODEL", "Salesforce/blip-image-captioning-base")
28
 
 
 
 
29
  # -------------------------
30
  # Models (CPU)
31
  # -------------------------
32
- trocr_processor = AutoProcessor.from_pretrained(TROCR_NAME)
33
  trocr_model = VisionEncoderDecoderModel.from_pretrained(TROCR_NAME).eval().to(DEVICE)
34
 
35
  blip_processor = BlipProcessor.from_pretrained(BLIP_NAME)
36
  blip_model = BlipForConditionalGeneration.from_pretrained(BLIP_NAME).eval().to(DEVICE)
37
 
38
  # -------------------------
39
- # Optional: pytesseract (for boxes on images)
40
  # -------------------------
41
  def _try_import_tesseract():
42
  try:
43
  import pytesseract # type: ignore
44
- # Quick sanity check: version call triggers binary lookup
45
  _ = pytesseract.get_tesseract_version()
46
  return pytesseract
47
  except Exception:
@@ -49,44 +54,28 @@ def _try_import_tesseract():
49
 
50
  PYTESS = _try_import_tesseract()
51
 
52
- # -------------------------
53
- # UI / tasks
54
- # -------------------------
55
- TASKS = [
56
- "OCR",
57
- "Markdown",
58
- "Locate",
59
- "Describe",
60
- ]
61
-
62
- DEFAULT_DPI = 200 # PDF render DPI
63
 
64
 
65
  # -------------------------
66
  # Helpers
67
  # -------------------------
68
- def _safe_font(size: int = 28):
69
- candidates = [
70
- "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
71
- "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
72
- ]
73
- for p in candidates:
74
- try:
75
- if os.path.exists(p):
76
- return ImageFont.truetype(p, size)
77
- except Exception:
78
- pass
79
- return ImageFont.load_default()
80
-
81
-
82
  def _to_rgb(img: Image.Image) -> Image.Image:
83
  if img.mode in ("RGBA", "LA", "P"):
84
  img = img.convert("RGB")
85
- return ImageOps.exif_transpose(img)
 
 
 
 
 
 
 
 
86
 
87
 
88
  def _tokenize(s: str) -> List[str]:
89
- return re.findall(r"[A-Za-zА-Яа-я0-9]+", s.lower())
90
 
91
 
92
  def trocr_ocr(img: Image.Image) -> str:
@@ -96,7 +85,7 @@ def trocr_ocr(img: Image.Image) -> str:
96
  with torch.no_grad():
97
  ids = trocr_model.generate(pixel_values, max_new_tokens=256)
98
  text = trocr_processor.batch_decode(ids, skip_special_tokens=True)[0]
99
- return text.strip()
100
 
101
 
102
  def blip_describe(img: Image.Image) -> str:
@@ -107,33 +96,25 @@ def blip_describe(img: Image.Image) -> str:
107
  return blip_processor.decode(out[0], skip_special_tokens=True).strip()
108
 
109
 
110
- def render_pdf_page(path: str, page_num: int, dpi: int = DEFAULT_DPI) -> Tuple[fitz.Document, fitz.Page, Image.Image, float]:
111
  doc = fitz.open(path)
112
- page_idx = max(0, min(page_num - 1, len(doc) - 1))
113
  page = doc.load_page(page_idx)
114
  zoom = dpi / 72.0
115
  pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
116
  img = Image.open(BytesIO(pix.tobytes("png")))
117
- return doc, page, img, zoom
118
 
119
 
120
  def pdf_has_text(page: fitz.Page) -> bool:
121
- # words is empty for scanned pages
122
- words = page.get_text("words")
123
- return bool(words)
124
 
125
 
126
  def pdf_extract_text(page: fitz.Page) -> str:
127
- txt = page.get_text("text") or ""
128
- return txt.strip()
129
 
130
 
131
  def pdf_to_markdown_simple(page: fitz.Page) -> str:
132
- """
133
- Lightweight markdown for selectable-text PDFs.
134
- - Uses span sizes to guess headers.
135
- - No heavy layout logic (keeps it stable and fast on CPU).
136
- """
137
  data = page.get_text("dict")
138
  spans = []
139
  for b in data.get("blocks", []):
@@ -149,7 +130,7 @@ def pdf_to_markdown_simple(page: fitz.Page) -> str:
149
  h1_thr = med * 1.60
150
  h2_thr = med * 1.35
151
 
152
- lines_out: List[str] = []
153
  for b in data.get("blocks", []):
154
  if b.get("type") != 0:
155
  continue
@@ -157,26 +138,22 @@ def pdf_to_markdown_simple(page: fitz.Page) -> str:
157
  parts = []
158
  sizes = []
159
  for sp in ln.get("spans", []):
160
- t = (sp.get("text") or "")
161
- if t.strip():
162
- parts.append(t.strip())
163
  sizes.append(float(sp.get("size", 0.0)))
164
  if not parts:
165
  continue
166
  line = " ".join(parts).strip()
167
  sz = max(sizes) if sizes else med
168
-
169
  if sz >= h1_thr:
170
- lines_out.append("# " + line)
171
  elif sz >= h2_thr:
172
- lines_out.append("## " + line)
173
  else:
174
- lines_out.append(line)
175
-
176
- lines_out.append("") # paragraph break
177
-
178
- md = "\n".join(lines_out).strip()
179
- return md
180
 
181
 
182
  def draw_rects(img: Image.Image, rects_px: List[Tuple[int, int, int, int]]) -> Image.Image:
@@ -192,23 +169,20 @@ def draw_rects(img: Image.Image, rects_px: List[Tuple[int, int, int, int]]) -> I
192
 
193
 
194
  def locate_in_pdf_words(page: fitz.Page, query: str) -> List[Tuple[float, float, float, float]]:
195
- """
196
- Returns list of rectangles in PDF coordinate space (points).
197
- Uses exact word sequence match (token-based).
198
- """
199
  q = _tokenize(query)
200
  if not q:
201
  return []
202
-
203
- words = page.get_text("words") # x0,y0,x1,y1,"word",block,line,wordno
204
  if not words:
205
  return []
206
 
207
- w_tokens = [_tokenize(w[4])[0] if _tokenize(w[4]) else "" for w in words]
208
- rects: List[Tuple[float, float, float, float]] = []
 
 
209
 
210
- n = len(w_tokens)
211
- m = len(q)
212
  for i in range(0, n - m + 1):
213
  if w_tokens[i:i + m] == q:
214
  xs0 = [float(words[j][0]) for j in range(i, i + m)]
@@ -216,24 +190,17 @@ def locate_in_pdf_words(page: fitz.Page, query: str) -> List[Tuple[float, float,
216
  xs1 = [float(words[j][2]) for j in range(i, i + m)]
217
  ys1 = [float(words[j][3]) for j in range(i, i + m)]
218
  rects.append((min(xs0), min(ys0), max(xs1), max(ys1)))
219
-
220
  return rects
221
 
222
 
223
- def locate_in_image_tesseract(img: Image.Image, query: str) -> Tuple[List[Tuple[int, int, int, int]], str]:
224
- """
225
- Returns pixel-space rectangles for located phrase, plus a short status message.
226
- If pytesseract is not available, returns empty list and message.
227
- """
228
  if PYTESS is None:
229
- return [], "Tesseract not available: no boxes for images."
230
-
231
  q = _tokenize(query)
232
  if not q:
233
  return [], "Empty query."
234
 
235
  img = _to_rgb(img)
236
- # Use data dict so it works consistently
237
  data = PYTESS.image_to_data(img, output_type=PYTESS.Output.DICT)
238
 
239
  texts = data.get("text", [])
@@ -249,148 +216,131 @@ def locate_in_image_tesseract(img: Image.Image, query: str) -> Tuple[List[Tuple[
249
  t = (t or "").strip()
250
  if not t:
251
  continue
252
- tok = _tokenize(t)
253
- if not tok:
254
  continue
255
- # Keep only "reasonable" confidence if numeric
256
  try:
257
  c = float(conf[i])
258
  if c < 0:
259
  continue
260
  except Exception:
261
  pass
262
-
263
- tokens.append(tok[0])
264
  boxes.append((int(left[i]), int(top[i]), int(left[i] + width[i]), int(top[i] + height[i])))
265
 
266
- rects: List[Tuple[int, int, int, int]] = []
267
- n = len(tokens)
268
- m = len(q)
269
  for i in range(0, n - m + 1):
270
  if tokens[i:i + m] == q:
271
  xs0 = [boxes[j][0] for j in range(i, i + m)]
272
  ys0 = [boxes[j][1] for j in range(i, i + m)]
273
  xs1 = [boxes[j][2] for j in range(i, i + m)]
274
  ys1 = [boxes[j][3] for j in range(i, i + m)]
275
- rects.append((min(xs0), min(ys0), max(xs1), max(ys1)))
276
 
277
- if not rects:
278
- return [], "Not found."
279
- return rects, "Found."
280
 
281
 
282
- def as_markdown_block(text: str) -> str:
283
- if not text.strip():
284
- return ""
285
- return "```text\n" + text.strip() + "\n```"
286
 
287
 
288
  # -------------------------
289
- # Main run
290
  # -------------------------
291
- def process(path: str, task: str, page_num: int, query: str):
292
- if not path:
293
- return "Upload a file.", "", None
294
 
295
- ext = os.path.splitext(path)[1].lower()
296
 
297
- # ---------- PDF ----------
298
  if ext == ".pdf":
299
- doc, page, page_img, zoom = render_pdf_page(path, page_num, dpi=DEFAULT_DPI)
300
  try:
 
 
301
  if task == "Describe":
302
- caption = blip_describe(page_img)
303
- return caption, as_markdown_block(caption), None
304
 
305
  if task == "OCR":
306
- if pdf_has_text(page):
307
- txt = pdf_extract_text(page)
308
- else:
309
- txt = trocr_ocr(page_img)
310
- return txt, as_markdown_block(txt), None
311
 
312
  if task == "Markdown":
313
  if pdf_has_text(page):
314
  md = pdf_to_markdown_simple(page)
315
  if not md:
316
- txt = pdf_extract_text(page)
317
- md = as_markdown_block(txt)
318
  else:
319
- txt = trocr_ocr(page_img)
320
- md = as_markdown_block(txt)
321
- return md, md, None
322
 
323
  if task == "Locate":
324
- if not query.strip():
325
- return "Enter text to locate.", "", page_img
326
 
327
- # 1) Prefer precise PDF word boxes (selectable text)
328
  rects_pdf = locate_in_pdf_words(page, query)
329
  if rects_pdf:
330
- # Convert PDF points -> pixels using same render zoom
331
- rects_px = []
332
- for (x0, y0, x1, y1) in rects_pdf:
333
- rects_px.append((int(x0 * zoom), int(y0 * zoom), int(x1 * zoom), int(y1 * zoom)))
334
  boxed = draw_rects(page_img, rects_px)
335
- return "Found.", "", boxed
336
 
337
- # 2) Fallback: if scanned page, try tesseract boxes on rendered image
338
  rects_px, msg = locate_in_image_tesseract(page_img, query)
339
  boxed = draw_rects(page_img, rects_px) if rects_px else page_img
340
- return msg, "", boxed
341
 
342
- return "Unknown task.", "", None
343
  finally:
344
  doc.close()
345
 
346
- # ---------- Image ----------
347
- img = _to_rgb(Image.open(path))
 
348
 
349
  if task == "Describe":
350
- caption = blip_describe(img)
351
- return caption, as_markdown_block(caption), None
352
 
353
  if task == "OCR":
354
  txt = trocr_ocr(img)
355
- return txt, as_markdown_block(txt), None
356
 
357
  if task == "Markdown":
358
- txt = trocr_ocr(img)
359
- md = as_markdown_block(txt)
360
- return md, md, None
361
 
362
  if task == "Locate":
363
- if not query.strip():
364
- return "Enter text to locate.", "", img
365
-
366
  rects_px, msg = locate_in_image_tesseract(img, query)
367
  boxed = draw_rects(img, rects_px) if rects_px else img
368
- return msg, "", boxed
369
 
370
- return "Unknown task.", "", None
371
 
372
 
373
  # -------------------------
374
- # UI helpers
375
  # -------------------------
376
- def update_page_selector(file_path: str):
377
  if not file_path:
378
- return gr.update(visible=False), gr.update(value=None)
379
 
380
  ext = os.path.splitext(file_path)[1].lower()
381
  if ext != ".pdf":
382
- return gr.update(visible=False), gr.update(value=_to_rgb(Image.open(file_path)))
383
 
384
  doc = fitz.open(file_path)
385
- pages = len(doc)
386
  doc.close()
387
 
388
- # Show first page preview
389
  _, _, img, _ = render_pdf_page(file_path, 1, dpi=DEFAULT_DPI)
390
- return (
391
- gr.update(visible=True, minimum=1, maximum=max(1, pages), value=1),
392
- gr.update(value=img),
393
- )
394
 
395
 
396
  def update_preview(file_path: str, page_num: int):
@@ -408,43 +358,37 @@ def toggle_query(task: str):
408
 
409
 
410
  # -------------------------
411
- # Build app (minimal style)
412
  # -------------------------
413
- theme = gr.themes.Base(
414
- font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui"],
415
  )
416
 
417
  with gr.Blocks(theme=theme, title="Doc Tool (CPU)") as demo:
418
  with gr.Row():
419
  with gr.Column(scale=1, min_width=320):
420
  file_in = gr.File(label="File", file_types=["image", ".pdf"], type="filepath")
421
- page_num = gr.Slider(label="Page", minimum=1, maximum=1, value=1, step=1, visible=False)
422
  task = gr.Dropdown(label="Task", choices=TASKS, value="OCR")
423
- query = gr.Textbox(label="Query", visible=False, placeholder="Text to locate")
424
-
425
  run_btn = gr.Button("Run", variant="primary")
426
 
427
  with gr.Column(scale=2):
428
- preview = gr.Image(label="Preview", type="pil", height=360)
429
- out_text = gr.Textbox(label="Output", lines=10)
430
- out_md = gr.Markdown()
431
-
432
- out_boxes = gr.Image(label="Boxes", type="pil", height=360)
433
 
434
- file_in.change(update_page_selector, inputs=[file_in], outputs=[page_num, preview])
435
- page_num.change(update_preview, inputs=[file_in, page_num], outputs=[preview])
436
  task.change(toggle_query, inputs=[task], outputs=[query])
437
 
438
- def on_run(file_path, task_name, page, q):
439
- text, md, boxed = process(file_path, task_name, int(page), q or "")
440
- return text, md, boxed
 
441
 
442
- run_btn.click(
443
- on_run,
444
- inputs=[file_in, task, page_num, query],
445
- outputs=[out_text, out_md, out_boxes],
446
- )
447
 
448
  if __name__ == "__main__":
449
- # Disable SSR to avoid extra startup noise
450
  demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)
 
1
  import os
2
  import re
 
3
  from io import BytesIO
4
+ from typing import List, Tuple
5
 
6
  import gradio as gr
7
  import torch
8
  import numpy as np
9
+ from PIL import Image, ImageDraw, ImageOps
10
  import fitz # PyMuPDF
11
 
12
  from transformers import (
13
+ TrOCRProcessor,
14
  VisionEncoderDecoderModel,
15
  BlipProcessor,
16
  BlipForConditionalGeneration,
17
  )
18
+ from transformers.utils import logging as hf_logging
19
 
20
  # -------------------------
21
+ # CPU-only, quieter logs
22
  # -------------------------
def _env_int(name: str, default: int, minimum: int = 1) -> int:
    """Read an integer setting from the environment.

    Returns ``default`` when the variable is unset, empty, not an integer,
    or smaller than ``minimum``, so a malformed value cannot crash start-up.
    """
    raw = os.getenv(name, "")
    try:
        value = int(raw.strip())
    except ValueError:
        return default
    return value if value >= minimum else default


hf_logging.set_verbosity_error()
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

DEVICE = torch.device("cpu")
# torch.set_num_threads requires a positive integer.
torch.set_num_threads(_env_int("TORCH_NUM_THREADS", 4))

# ``or`` also covers variables that are set but empty.
TROCR_NAME = os.getenv("TROCR_MODEL") or "microsoft/trocr-base-printed"
BLIP_NAME = os.getenv("BLIP_MODEL") or "Salesforce/blip-image-captioning-base"

DEFAULT_DPI = 200
MAX_SIDE = _env_int("MAX_SIDE", 1600)  # soft cap for CPU speed

# -------------------------
# Models (CPU)
# -------------------------
trocr_processor = TrOCRProcessor.from_pretrained(TROCR_NAME)
trocr_model = VisionEncoderDecoderModel.from_pretrained(TROCR_NAME).eval().to(DEVICE)

blip_processor = BlipProcessor.from_pretrained(BLIP_NAME)
blip_model = BlipForConditionalGeneration.from_pretrained(BLIP_NAME).eval().to(DEVICE)
43
 
44
  # -------------------------
45
+ # Optional: Tesseract for image boxes
46
  # -------------------------
47
  def _try_import_tesseract():
48
  try:
49
  import pytesseract # type: ignore
 
50
  _ = pytesseract.get_tesseract_version()
51
  return pytesseract
52
  except Exception:
 
54
 
55
  PYTESS = _try_import_tesseract()
56
 
57
+ TASKS = ["OCR", "Markdown", "Locate", "Describe"]
 
 
 
 
 
 
 
 
 
 
58
 
59
 
60
  # -------------------------
61
  # Helpers
62
  # -------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
def _to_rgb(img: Image.Image) -> Image.Image:
    """Return an upright RGB version of ``img`` whose longest side is capped.

    Steps, in order:
      1. Apply the EXIF orientation (done before any mode conversion, while
         the image is still the object that was opened from the file).
      2. Convert every non-RGB mode (not just RGBA/LA/P) to RGB so callers
         always receive a three-channel image.
      3. Downscale so that ``max(width, height) <= MAX_SIDE`` to keep CPU
         inference time reasonable.
    """
    img = ImageOps.exif_transpose(img)

    if img.mode != "RGB":
        img = img.convert("RGB")

    # Keep CPU inference reasonable
    w, h = img.size
    longest = max(w, h)
    if longest > MAX_SIDE:
        scale = MAX_SIDE / float(longest)
        # Clamp to 1 px: a very thin image would otherwise round to a
        # zero-sized side, which resize() rejects.
        new_size = (max(1, int(w * scale)), max(1, int(h * scale)))
        img = img.resize(new_size, Image.Resampling.LANCZOS)
    return img
75
 
76
 
77
  def _tokenize(s: str) -> List[str]:
78
+ return re.findall(r"[A-Za-zА-Яа-я0-9]+", (s or "").lower())
79
 
80
 
81
  def trocr_ocr(img: Image.Image) -> str:
 
85
  with torch.no_grad():
86
  ids = trocr_model.generate(pixel_values, max_new_tokens=256)
87
  text = trocr_processor.batch_decode(ids, skip_special_tokens=True)[0]
88
+ return (text or "").strip()
89
 
90
 
91
  def blip_describe(img: Image.Image) -> str:
 
96
  return blip_processor.decode(out[0], skip_special_tokens=True).strip()
97
 
98
 
def render_pdf_page(path: str, page_num: int, dpi: int = DEFAULT_DPI):
    """Open a PDF and rasterise one of its pages.

    Args:
        path: Path of the PDF file.
        page_num: 1-based page number; clamped to the document's page range.
        dpi: Resolution used for the initial render.

    Returns:
        ``(doc, page, img, zoom)``. ``doc`` is left open and must be closed
        by the caller. ``img`` is the page image after ``_to_rgb``. ``zoom``
        is the scale that maps PDF points to pixels of the *returned* image.

    Raises:
        Whatever PyMuPDF/Pillow raise; the document is closed first so the
        file handle is not leaked on failure.
    """
    doc = fitz.open(path)
    try:
        page_idx = max(0, min(int(page_num) - 1, len(doc) - 1))
        page = doc.load_page(page_idx)
        zoom = dpi / 72.0
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
        img = _to_rgb(Image.open(BytesIO(pix.tobytes("png"))))

        # _to_rgb may downscale the render. Fold that factor into zoom so
        # rectangles converted with it line up with the returned image.
        rendered_longest = max(pix.width, pix.height)
        returned_longest = max(img.size)
        if rendered_longest and returned_longest != rendered_longest:
            zoom *= returned_longest / float(rendered_longest)
    except Exception:
        doc.close()
        raise
    return doc, page, img, zoom
107
 
108
 
def pdf_has_text(page: fitz.Page) -> bool:
    """Report whether ``page.get_text("words")`` yields any entries.

    An empty result means the page has no extractable text layer
    (presumably a scanned page), which callers use to decide on OCR.
    """
    word_entries = page.get_text("words")
    return True if word_entries else False
 
 
111
 
112
 
def pdf_extract_text(page: fitz.Page) -> str:
    """Return the page's plain text with surrounding whitespace removed.

    A falsy extraction result is reported as the empty string.
    """
    raw_text = page.get_text("text")
    if not raw_text:
        return ""
    return raw_text.strip()
 
115
 
116
 
117
  def pdf_to_markdown_simple(page: fitz.Page) -> str:
 
 
 
 
 
118
  data = page.get_text("dict")
119
  spans = []
120
  for b in data.get("blocks", []):
 
130
  h1_thr = med * 1.60
131
  h2_thr = med * 1.35
132
 
133
+ out_lines: List[str] = []
134
  for b in data.get("blocks", []):
135
  if b.get("type") != 0:
136
  continue
 
138
  parts = []
139
  sizes = []
140
  for sp in ln.get("spans", []):
141
+ t = (sp.get("text") or "").strip()
142
+ if t:
143
+ parts.append(t)
144
  sizes.append(float(sp.get("size", 0.0)))
145
  if not parts:
146
  continue
147
  line = " ".join(parts).strip()
148
  sz = max(sizes) if sizes else med
 
149
  if sz >= h1_thr:
150
+ out_lines.append("# " + line)
151
  elif sz >= h2_thr:
152
+ out_lines.append("## " + line)
153
  else:
154
+ out_lines.append(line)
155
+ out_lines.append("")
156
+ return "\n".join(out_lines).strip()
 
 
 
157
 
158
 
159
  def draw_rects(img: Image.Image, rects_px: List[Tuple[int, int, int, int]]) -> Image.Image:
 
169
 
170
 
171
  def locate_in_pdf_words(page: fitz.Page, query: str) -> List[Tuple[float, float, float, float]]:
 
 
 
 
172
  q = _tokenize(query)
173
  if not q:
174
  return []
175
+ words = page.get_text("words")
 
176
  if not words:
177
  return []
178
 
179
+ w_tokens = []
180
+ for w in words:
181
+ toks = _tokenize(w[4])
182
+ w_tokens.append(toks[0] if toks else "")
183
 
184
+ rects = []
185
+ n, m = len(w_tokens), len(q)
186
  for i in range(0, n - m + 1):
187
  if w_tokens[i:i + m] == q:
188
  xs0 = [float(words[j][0]) for j in range(i, i + m)]
 
190
  xs1 = [float(words[j][2]) for j in range(i, i + m)]
191
  ys1 = [float(words[j][3]) for j in range(i, i + m)]
192
  rects.append((min(xs0), min(ys0), max(xs1), max(ys1)))
 
193
  return rects
194
 
195
 
196
+ def locate_in_image_tesseract(img: Image.Image, query: str):
 
 
 
 
197
  if PYTESS is None:
198
+ return [], "Tesseract not available."
 
199
  q = _tokenize(query)
200
  if not q:
201
  return [], "Empty query."
202
 
203
  img = _to_rgb(img)
 
204
  data = PYTESS.image_to_data(img, output_type=PYTESS.Output.DICT)
205
 
206
  texts = data.get("text", [])
 
216
  t = (t or "").strip()
217
  if not t:
218
  continue
219
+ toks = _tokenize(t)
220
+ if not toks:
221
  continue
 
222
  try:
223
  c = float(conf[i])
224
  if c < 0:
225
  continue
226
  except Exception:
227
  pass
228
+ tokens.append(toks[0])
 
229
  boxes.append((int(left[i]), int(top[i]), int(left[i] + width[i]), int(top[i] + height[i])))
230
 
231
+ rects_px = []
232
+ n, m = len(tokens), len(q)
 
233
  for i in range(0, n - m + 1):
234
  if tokens[i:i + m] == q:
235
  xs0 = [boxes[j][0] for j in range(i, i + m)]
236
  ys0 = [boxes[j][1] for j in range(i, i + m)]
237
  xs1 = [boxes[j][2] for j in range(i, i + m)]
238
  ys1 = [boxes[j][3] for j in range(i, i + m)]
239
+ rects_px.append((min(xs0), min(ys0), max(xs1), max(ys1)))
240
 
241
+ return rects_px, ("Found." if rects_px else "Not found.")
 
 
242
 
243
 
def as_text_block(s: str) -> str:
    """Return ``s`` without surrounding whitespace; ``None``/empty gives ""."""
    return (s or "").strip()
 
247
 
248
 
249
  # -------------------------
250
+ # Core processing
251
  # -------------------------
def _run_task(img, task: str, query: str, page=None, zoom: float = 1.0):
    """Run one task on a page/image and build the 4-tuple ``process`` returns.

    ``page`` is the PyMuPDF page when the input is a PDF (enables the
    selectable-text shortcuts) and ``None`` for plain images. ``zoom`` maps
    PDF points to pixels of ``img``.
    """
    if task == "Describe":
        caption = blip_describe(img)
        return caption, caption, None, img

    if task in ("OCR", "Markdown"):
        if page is not None and pdf_has_text(page):
            # Markdown tries the structured conversion first; both tasks
            # fall back to the page's plain text.
            text = pdf_to_markdown_simple(page) if task == "Markdown" else ""
            if not text:
                text = pdf_extract_text(page)
        else:
            text = trocr_ocr(img)
        return text, text, None, img

    if task == "Locate":
        if not (query or "").strip():
            return "Enter query.", "", img, img

        if page is not None:
            # selectable-text PDF: precise boxes
            rects_pdf = locate_in_pdf_words(page, query)
            if rects_pdf:
                rects_px = [
                    (int(x0 * zoom), int(y0 * zoom), int(x1 * zoom), int(y1 * zoom))
                    for x0, y0, x1, y1 in rects_pdf
                ]
                return "Found.", "", draw_rects(img, rects_px), img

        # plain image, or fallback for PDFs without a text match: tesseract
        rects_px, msg = locate_in_image_tesseract(img, query)
        boxed = draw_rects(img, rects_px) if rects_px else img
        return msg, "", boxed, img

    return "Unknown task.", "", None, img


def process(file_path: str, task: str, page_num: int, query: str):
    """Run ``task`` on an uploaded PDF page or image.

    Args:
        file_path: Path of the uploaded file; a ``.pdf`` extension selects
            the PDF path, anything else is opened as an image.
        task: One of "OCR", "Markdown", "Locate", "Describe".
        page_num: 1-based page number (used for PDFs only).
        query: Text to find (used by "Locate" only).

    Returns:
        ``(text, markdown, boxes_image, preview_image)``. ``boxes_image`` is
        ``None`` for every task except "Locate".
    """
    if not file_path:
        return "Upload a file.", "", None, None

    # PDF
    if os.path.splitext(file_path)[1].lower() == ".pdf":
        doc, page, page_img, zoom = render_pdf_page(file_path, page_num, dpi=DEFAULT_DPI)
        try:
            return _run_task(page_img, task, query, page=page, zoom=zoom)
        finally:
            doc.close()

    # Image: release the file handle as soon as the pixels are in memory.
    with Image.open(file_path) as src:
        img = _to_rgb(src)
        if img is src:
            # Closing ``src`` would invalidate it, so detach a copy.
            img = src.copy()
    return _run_task(img, task, query)
325
 
326
 
327
  # -------------------------
328
+ # UI wiring
329
  # -------------------------
def update_page_ui(file_path: str):
    """Refresh the page slider and preview after a file is selected.

    Returns:
        ``(slider_update, preview_image)``. The slider is hidden when no
        file is given or the file is not a PDF; for PDFs it is shown with
        the range ``1..page_count`` and reset to page 1.
    """
    if not file_path:
        return gr.update(visible=False), None

    ext = os.path.splitext(file_path)[1].lower()
    if ext != ".pdf":
        # Release the file handle once the pixels are in memory.
        with Image.open(file_path) as src:
            img = _to_rgb(src)
            if img is src:
                # Closing ``src`` would invalidate it, so detach a copy.
                img = src.copy()
        return gr.update(visible=False), img

    # render_pdf_page hands back an open document: reuse it for the page
    # count and close it, rather than opening the file a second time and
    # dropping this handle unclosed.
    doc, _page, img, _zoom = render_pdf_page(file_path, 1, dpi=DEFAULT_DPI)
    try:
        pages = max(1, len(doc))
    finally:
        doc.close()

    return gr.update(visible=True, minimum=1, maximum=pages, value=1), img
 
 
 
344
 
345
 
346
  def update_preview(file_path: str, page_num: int):
 
358
 
359
 
360
  # -------------------------
361
+ # Minimal UI style
362
  # -------------------------
363
+ theme = gr.themes.Monochrome(
364
+ font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui"]
365
  )
366
 
367
  with gr.Blocks(theme=theme, title="Doc Tool (CPU)") as demo:
368
  with gr.Row():
369
  with gr.Column(scale=1, min_width=320):
370
  file_in = gr.File(label="File", file_types=["image", ".pdf"], type="filepath")
371
+ page = gr.Slider(label="Page", minimum=1, maximum=1, value=1, step=1, visible=False)
372
  task = gr.Dropdown(label="Task", choices=TASKS, value="OCR")
373
+ query = gr.Textbox(label="Query", placeholder="Text to locate", visible=False)
 
374
  run_btn = gr.Button("Run", variant="primary")
375
 
376
  with gr.Column(scale=2):
377
+ with gr.Row():
378
+ preview = gr.Image(label="Preview", type="pil", height=320)
379
+ boxes = gr.Image(label="Boxes", type="pil", height=320)
380
+ out = gr.Textbox(label="Output", lines=10)
 
381
 
382
+ file_in.change(update_page_ui, inputs=[file_in], outputs=[page, preview])
383
+ page.change(update_preview, inputs=[file_in, page], outputs=[preview])
384
  task.change(toggle_query, inputs=[task], outputs=[query])
385
 
386
+ def on_run(fp, t, p, q):
387
+ text, _, boxed, prev = process(fp, t, int(p), q or "")
388
+ # keep preview stable; boxes only when relevant
389
+ return prev, boxed, as_text_block(text)
390
 
391
+ run_btn.click(on_run, inputs=[file_in, task, page, query], outputs=[preview, boxes, out])
 
 
 
 
392
 
393
  if __name__ == "__main__":
 
394
  demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)