drewThomasson committed on
Commit
007c55d
·
verified ·
1 Parent(s): 991beb3

Upload 10 files

Browse files
Files changed (11) hide show
  1. .gitattributes +1 -0
  2. Dockerfile +36 -0
  3. HF_README.md +22 -0
  4. app.py +399 -0
  5. llma quotation.pdf +3 -0
  6. requirements.txt +4 -0
  7. static/app.js +791 -0
  8. static/app2.js +751 -0
  9. static/app3.js +831 -0
  10. static/style.css +240 -0
  11. templates/index.html +119 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ llma[[:space:]]quotation.pdf filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ # System dependencies for OCR (tesseract + ghostscript) and PDF rendering
4
+ RUN apt-get update && apt-get install -y --no-install-recommends \
5
+ tesseract-ocr \
6
+ tesseract-ocr-eng \
7
+ ghostscript \
8
+ libglib2.0-0 \
9
+ libsm6 \
10
+ libxext6 \
11
+ libxrender1 \
12
+ && rm -rf /var/lib/apt/lists/*
13
+
14
+ # Create a non-root user (Hugging Face Spaces requirement)
15
+ RUN useradd -m -u 1000 user
16
+ ENV HOME=/home/user \
17
+ PATH=/home/user/.local/bin:$PATH
18
+
19
+ WORKDIR /home/user/app
20
+
21
+ # Install Python dependencies
22
+ COPY requirements.txt .
23
+ RUN pip install --no-cache-dir -r requirements.txt
24
+
25
+ # Copy application code
26
+ COPY . .
27
+
28
+ # Ensure the non-root user owns the app directory
29
+ RUN chown -R user:user /home/user/app
30
+
31
+ USER user
32
+
33
+ # Hugging Face Spaces expects port 7860
34
+ EXPOSE 7860
35
+
36
+ CMD ["python", "app.py", "--host", "0.0.0.0", "--port", "7860"]
HF_README.md ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: DocSearcher
3
+ emoji: 🔍
4
+ colorFrom: yellow
5
+ colorTo: orange
6
+ sdk: docker
7
+ app_port: 7860
8
+ pinned: false
9
+ license: mit
10
+ short_description: Search keywords in PDFs with OCR support
11
+ ---
12
+
13
+ # DocSearcher (5th Gen)
14
+
15
+ Easy searching of multiple keywords in PDFs with OCR support.
16
+
17
+ ## Features
18
+
19
+ - Upload any PDF and search for exact words across all pages
20
+ - Optional OCR (Optical Character Recognition) for scanned documents
21
+ - Visual highlighting of matched words on rendered page images
22
+ - Download OCR-processed PDFs
app.py ADDED
@@ -0,0 +1,399 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import os
3
+ import time
4
+ import uuid
5
+ import argparse
6
+ import tempfile
7
+ import traceback
8
+ from dataclasses import dataclass, field
9
+ from typing import Dict, List, Tuple, Optional
10
+
11
+ from flask import (
12
+ Flask, render_template, request, jsonify, send_file, abort
13
+ )
14
+ from PIL import Image
15
+ import fitz # PyMuPDF
16
+ import ocrmypdf # OCR engine
17
+
18
+ # -----------------------------------------
19
+ # Configuration
20
+ # -----------------------------------------
21
+ IMAGE_DPI_SCALE = 1.6 # Page rendering zoom (1.0 = 72dpi)
22
+ PAGE_IMAGE_FORMAT = "PNG"
23
+ HIGHLIGHT_COLOR = "#FFA800"
24
+ DOC_EXPIRY_SECONDS = 60 * 60 # 1 hour inactivity
25
+ CLEAN_INTERVAL_SECONDS = 600 # Cleanup frequency
26
+ MAX_PAGES = 3000 # Indexing safety
27
+ MAX_FILE_SIZE_MB = 800 # Raised size limit (adjust as desired)
28
+
29
+ # OCR configuration
30
+ OCR_DESKEW = True
31
+ OCR_OPTIMIZE = 3
32
+ OCR_SKIP_TEXT = True
33
+ OCR_MAX_PAGES = 5000
34
+ OCR_TIMEOUT_SECONDS = 1800
35
+ OCR_ROTATE_PAGES = True
36
+ OCR_ROTATE_PAGES_THRESHOLD = 1.0
37
+ DEBUG_OCR_ERRORS = True
38
+
39
+ app = Flask(__name__)
40
+ app.config['MAX_CONTENT_LENGTH'] = MAX_FILE_SIZE_MB * 1024 * 1024
41
+
42
+ # -----------------------------------------
43
+ # Data Structures
44
+ # -----------------------------------------
45
@dataclass
class PageWord:
    """One word extracted from a PDF page, with its position on the page."""

    text: str
    # (x0, y0, x1, y1); extract_pdf divides x by page width and y by page height.
    bbox: Tuple[float, float, float, float]  # normalized
49
+
50
@dataclass
class DocumentData:
    """In-memory record for one uploaded PDF and the content extracted from it."""

    doc_id: str
    filename: str
    pages: int
    uploaded_at: float
    last_access: float  # refreshed by touch(); DocumentStore.cleanup expires on it
    original_pdf_path: str
    ocr_pdf_path: Optional[str]
    active_pdf_path: str  # the PDF that extraction and page rendering read from
    ocr_performed: bool
    ocr_failed: bool
    ocr_message: Optional[str] = None
    ocr_time: Optional[float] = None
    # The three maps below are keyed by 1-based page number.
    page_text: Dict[int, str] = field(default_factory=dict)
    page_words: Dict[int, List[PageWord]] = field(default_factory=dict)
    page_image_cache: Dict[int, bytes] = field(default_factory=dict)

    def touch(self):
        """Record the current time as the last access time."""
        self.last_access = time.time()
70
+
71
+ # -----------------------------------------
72
+ # In-Memory Store
73
+ # -----------------------------------------
74
class DocumentStore:
    """In-memory registry of uploaded documents with inactivity-based expiry."""

    def __init__(self):
        self._docs: Dict[str, DocumentData] = {}
        self._last_clean = 0.0

    def add(self, doc: DocumentData):
        """Register *doc* under its ``doc_id``, replacing any existing entry."""
        self._docs[doc.doc_id] = doc

    def get(self, doc_id: str) -> Optional[DocumentData]:
        """Return the document for *doc_id* (refreshing its access time) or None."""
        doc = self._docs.get(doc_id)
        if doc:
            doc.touch()
        return doc

    @staticmethod
    def _remove_file(path):
        """Best-effort removal of *path*; a missing or locked file is ignored."""
        if not path:
            return
        try:
            # Attempt the removal directly instead of checking existence first,
            # which avoids a check-then-act race with other removers.
            os.remove(path)
        except OSError:
            pass

    def cleanup(self):
        """Drop documents idle for longer than DOC_EXPIRY_SECONDS.

        Runs at most once every CLEAN_INTERVAL_SECONDS; earlier calls return
        immediately. The temp PDFs of each expired document are deleted too.
        """
        now = time.time()
        if now - self._last_clean < CLEAN_INTERVAL_SECONDS:
            return
        stale = [
            key for key, doc in self._docs.items()
            if now - doc.last_access > DOC_EXPIRY_SECONDS
        ]
        for sid in stale:
            doc = self._docs.pop(sid, None)
            if doc is None:
                continue
            self._remove_file(doc.original_pdf_path)
            self._remove_file(doc.ocr_pdf_path)
        self._last_clean = now
108
+
109
+ store = DocumentStore()
110
+
111
+ # -----------------------------------------
112
+ # PDF / OCR Helpers
113
+ # -----------------------------------------
114
def extract_pdf(pdf_path: str) -> Tuple[Dict[int, str], Dict[int, List[PageWord]]]:
    """Extract per-page text and positioned words from a PDF.

    Returns:
        ``(page_text, page_words)``, both keyed by 1-based page number.
        Word boxes are divided by the page width and height.

    Raises:
        ValueError: if the document has more than MAX_PAGES pages.
    """
    page_text: Dict[int, str] = {}
    page_words: Dict[int, List[PageWord]] = {}
    with fitz.open(pdf_path) as doc:
        if len(doc) > MAX_PAGES:
            raise ValueError(f"PDF exceeds page limit ({MAX_PAGES}).")
        for idx, page in enumerate(doc, start=1):
            page_text[idx] = page.get_text()
            w, h = page.rect.width, page.rect.height
            if w <= 0 or h <= 0:
                # A degenerate page cannot be normalized; keep its text but
                # report no word boxes rather than dividing by zero.
                page_words[idx] = []
                continue
            tokens: List[PageWord] = []
            for wr in page.get_text("words"):
                if len(wr) < 5:
                    continue
                x0, y0, x1, y1, txt = wr[:5]
                if txt.strip():
                    tokens.append(PageWord(txt, (x0 / w, y0 / h, x1 / w, y1 / h)))
            page_words[idx] = tokens
    return page_text, page_words
135
+
136
def render_page_image(pdf_path: str, page_number: int, zoom: float) -> bytes:
    """Rasterize one page (1-based *page_number*) at *zoom* and return image bytes.

    The image is encoded in PAGE_IMAGE_FORMAT.
    """
    with fitz.open(pdf_path) as pdf:
        target = pdf[page_number - 1]
        pixmap = target.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
        picture = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
        sink = io.BytesIO()
        picture.save(sink, format=PAGE_IMAGE_FORMAT)
        return sink.getvalue()
148
+
149
def parse_query_words(raw: str) -> List[str]:
    """Split a raw query into unique, lower-cased words.

    Words are separated by any run of commas, semicolons or whitespace.
    First-occurrence order is preserved. Anything that is not a string
    (for example ``None`` from a malformed request) yields an empty list.
    """
    import re

    if not isinstance(raw, str):
        return []
    lowered = (tok.lower() for tok in re.split(r"[,\s;]+", raw) if tok)
    # dict.fromkeys dedupes while keeping insertion order.
    return list(dict.fromkeys(lowered))
162
+
163
def find_pages_with_words(doc_data: DocumentData, words: List[str]):
    """List the pages containing any of *words*, with per-word hit counts.

    A token matches when its lower-cased text equals one of *words*. Each
    result is ``{"page", "counts", "total_matches"}``; results are ordered
    by page number.
    """
    wanted = set(words)
    hits = []
    for page_no, page_tokens in doc_data.page_words.items():
        tally = dict.fromkeys(words, 0)
        for token in page_tokens:
            key = token.text.lower()
            if key in wanted:
                tally[key] += 1
        total = sum(tally.values())
        if total:
            hits.append({
                "page": page_no,
                "counts": tally,
                "total_matches": total
            })
    return sorted(hits, key=lambda hit: hit["page"])
182
+
183
def perform_ocr(original_path: str, doc_id: str, lang: str):
    """Run OCR on *original_path* and report the outcome.

    Returns:
        A 6-tuple ``(active_path, ocr_performed, ocr_failed, message,
        ocr_pdf_path, elapsed_seconds)``. On any failure ``active_path`` is
        the original file and ``ocr_pdf_path`` is ``None``.
    """
    def _discard(path):
        # Best-effort removal of a stale or unusable output file.
        try:
            os.remove(path)
        except OSError:
            pass

    try:
        with fitz.open(original_path) as probe:
            if len(probe) > OCR_MAX_PAGES:
                return original_path, False, True, f"OCR aborted: exceeds {OCR_MAX_PAGES} pages.", None, 0.0
    except Exception as e:
        return original_path, False, True, f"OCR inspection failed: {e}", None, 0.0

    out_path = os.path.join(tempfile.gettempdir(), f"{doc_id}_ocr.pdf")
    _discard(out_path)

    ocr_args = dict(
        language=lang or "eng",
        deskew=OCR_DESKEW,
        optimize=OCR_OPTIMIZE,
        skip_text=OCR_SKIP_TEXT,
        tesseract_timeout=OCR_TIMEOUT_SECONDS,
        rotate_pages=OCR_ROTATE_PAGES,
        rotate_pages_threshold=OCR_ROTATE_PAGES_THRESHOLD,
    )
    start = time.time()
    try:
        ocrmypdf.ocr(original_path, out_path, **ocr_args)
        elapsed = time.time() - start
        if not os.path.exists(out_path) or os.path.getsize(out_path) == 0:
            # The caller receives no path for this file, so remove it here;
            # otherwise an empty output would stay in the temp dir forever.
            _discard(out_path)
            return original_path, True, True, "OCR produced no output.", None, elapsed
        return out_path, True, False, f"OCR (rotate+deskew) completed in {elapsed:.1f}s.", out_path, elapsed
    except Exception as e:
        elapsed = time.time() - start
        tb = traceback.format_exc()
        # Same reasoning as above: a partial output is unreachable by cleanup.
        _discard(out_path)
        msg = f"OCR failed after {elapsed:.1f}s: {e}"
        if DEBUG_OCR_ERRORS:
            # NOTE(review): api_upload returns this message to the client, so
            # the traceback is exposed while DEBUG_OCR_ERRORS is on.
            msg += f"\n{tb}"
        return original_path, True, True, msg, None, elapsed
221
+
222
+ # -----------------------------------------
223
+ # Routes
224
+ # -----------------------------------------
225
@app.route("/")
def index():
    """Serve the main page after giving the store a chance to expire documents."""
    store.cleanup()
    template_vars = {"highlight_color": HIGHLIGHT_COLOR}
    return render_template("index.html", **template_vars)
229
+
230
@app.post("/api/upload")
def api_upload():
    """Accept a PDF upload, optionally OCR it, index its words and register it.

    Form fields: ``pdf`` (the file), ``ocr`` ("true" to run OCR), ``lang``
    (OCR language, default "eng"). Responds with the document summary as
    JSON, or a JSON ``error`` with status 400/500.
    """
    store.cleanup()

    upload = request.files.get("pdf")
    if not upload:
        return jsonify({"error": "No file uploaded"}), 400
    if not upload.filename.lower().endswith(".pdf"):
        return jsonify({"error": "File must be a PDF"}), 400

    # Measure the upload by seeking to its end, then rewind before saving.
    upload.seek(0, os.SEEK_END)
    megabytes = upload.tell() / (1024 * 1024)
    upload.seek(0)
    if megabytes > MAX_FILE_SIZE_MB:
        return jsonify({"error": f"File too large (> {MAX_FILE_SIZE_MB} MB)"}), 400

    do_ocr = request.form.get("ocr", "false").lower() == "true"
    lang = request.form.get("lang", "eng").strip() or "eng"

    doc_id = uuid.uuid4().hex
    orig_path = os.path.join(tempfile.gettempdir(), f"upload_{doc_id}.pdf")
    upload.save(orig_path)

    # Default outcome describes the "OCR not requested" case.
    outcome = (orig_path, False, False, None, None, None)
    if do_ocr:
        outcome = perform_ocr(orig_path, doc_id, lang)
    active_path, ocr_performed, ocr_failed, ocr_message, ocr_pdf_path, ocr_time = outcome

    try:
        page_text, page_words = extract_pdf(active_path)
    except Exception as e:
        # Indexing failed: discard whatever files this request created.
        for leftover in (orig_path, ocr_pdf_path):
            if not leftover:
                continue
            try:
                os.remove(leftover)
            except Exception:
                pass
        return jsonify({"error": f"Failed to process PDF: {e}"}), 500

    doc_data = DocumentData(
        doc_id=doc_id,
        filename=upload.filename,
        pages=len(page_text),
        uploaded_at=time.time(),
        last_access=time.time(),
        original_pdf_path=orig_path,
        ocr_pdf_path=ocr_pdf_path,
        active_pdf_path=active_path,
        ocr_performed=ocr_performed,
        ocr_failed=ocr_failed,
        ocr_message=ocr_message,
        ocr_time=ocr_time,
        page_text=page_text,
        page_words=page_words
    )
    store.add(doc_data)

    summary = {
        "doc_id": doc_id,
        "filename": upload.filename,
        "pages": doc_data.pages,
        "ocr_performed": ocr_performed,
        "ocr_failed": ocr_failed,
        "ocr_message": ocr_message,
        "ocr_time_seconds": ocr_time,
        "used_ocr_pdf": (active_path != orig_path),
        "rotate_pages": OCR_ROTATE_PAGES if do_ocr else False,
        "rotate_threshold": OCR_ROTATE_PAGES_THRESHOLD if do_ocr else None
    }
    return jsonify(summary)
308
+
309
@app.get("/api/doc/<doc_id>/meta")
def api_doc_meta(doc_id):
    """Return metadata for a stored document, or a 404 JSON error."""
    doc = store.get(doc_id)
    if not doc:
        return jsonify({"error": "Not found"}), 404

    # The download link is offered only when OCR ran, succeeded and left a file path.
    ocr_ready = doc.ocr_performed and not doc.ocr_failed and doc.ocr_pdf_path
    download_url = f"/api/doc/{doc_id}/download/ocr" if ocr_ready else None

    meta = {
        "doc_id": doc.doc_id,
        "filename": doc.filename,
        "pages": doc.pages,
        "ocr_performed": doc.ocr_performed,
        "ocr_failed": doc.ocr_failed,
        "ocr_message": doc.ocr_message,
        "ocr_time_seconds": doc.ocr_time,
        "download_ocr_url": download_url
    }
    return jsonify(meta)
325
+
326
@app.get("/api/doc/<doc_id>/download/ocr")
def api_download_ocr(doc_id):
    """Send the OCR-processed PDF as an attachment, or a 404 JSON error."""
    doc = store.get(doc_id)
    if not doc:
        return jsonify({"error": "Not found"}), 404

    ocr_path = doc.ocr_pdf_path
    if ocr_path and os.path.exists(ocr_path):
        return send_file(
            ocr_path,
            mimetype="application/pdf",
            as_attachment=True,
            download_name=f"{doc.doc_id}_ocr.pdf",
        )
    return jsonify({"error": "No OCR PDF available"}), 404
335
+
336
@app.post("/api/doc/<doc_id>/search")
def api_search(doc_id):
    """Search a stored document for the words given in the JSON body.

    Expects ``{"words": "<comma/space/semicolon separated words>"}``.
    Responds with the parsed words and the matching pages. A missing or
    empty body yields an empty result; a body that is not a JSON object,
    or a non-string ``words``, is rejected with 400.
    """
    d = store.get(doc_id)
    if not d:
        return jsonify({"error": "Not found"}), 404

    payload = request.get_json(silent=True) or {}
    if not isinstance(payload, dict):
        # e.g. a JSON array: calling .get() on it would raise and return a 500.
        return jsonify({"error": "Request body must be a JSON object"}), 400
    raw_words = payload.get("words", "")
    if not isinstance(raw_words, str):
        return jsonify({"error": "'words' must be a string"}), 400

    words = parse_query_words(raw_words)
    if not words:
        return jsonify({"words": [], "results": []})
    results = find_pages_with_words(d, words)
    return jsonify({"words": words, "results": results})
347
+
348
@app.get("/api/doc/<doc_id>/page/<int:page_num>")
def api_page(doc_id, page_num: int):
    """Return one page's tokens, text and image URL as JSON.

    The page image is rendered into the document's cache as a side effect
    if it is not cached yet.
    """
    doc = store.get(doc_id)
    if not doc:
        return jsonify({"error": "Not found"}), 404
    if not 1 <= page_num <= doc.pages:
        return jsonify({"error": "Invalid page"}), 400

    cache = doc.page_image_cache
    if page_num not in cache:
        try:
            cache[page_num] = render_page_image(doc.active_pdf_path, page_num, IMAGE_DPI_SCALE)
        except Exception as e:
            return jsonify({"error": f"Failed to render page: {e}"}), 500

    tokens = []
    for word in doc.page_words[page_num]:
        tokens.append({"text": word.text, "bbox": word.bbox})

    body = {
        "page": page_num,
        "tokens": tokens,
        "text": doc.page_text.get(page_num, ""),
        "image_url": f"/api/doc/{doc_id}/page/{page_num}/image"
    }
    return jsonify(body)
370
+
371
@app.get("/api/doc/<doc_id>/page/<int:page_num>/image")
def api_page_image(doc_id, page_num):
    """Serve the rendered PNG for one page, rendering and caching it on first use."""
    doc = store.get(doc_id)
    if not doc:
        abort(404)
    if not 1 <= page_num <= doc.pages:
        abort(400)

    cache = doc.page_image_cache
    if page_num not in cache:
        try:
            cache[page_num] = render_page_image(doc.active_pdf_path, page_num, IMAGE_DPI_SCALE)
        except Exception:
            abort(500)

    png_stream = io.BytesIO(cache[page_num])
    return send_file(
        png_stream,
        mimetype="image/png",
        as_attachment=False,
        download_name=f"{doc_id}_page_{page_num}.png"
    )
389
+
390
def main():
    """Parse command-line options and start the Flask server."""
    cli = argparse.ArgumentParser(description="Run PDF Word Finder with OCR (auto-rotate).")
    cli.add_argument("--host", default="127.0.0.1")
    cli.add_argument("--port", type=int, default=8000)
    cli.add_argument("--debug", action="store_true")
    options = cli.parse_args()
    app.run(host=options.host, port=options.port, debug=options.debug)
397
+
398
+ if __name__ == "__main__":
399
+ main()
llma quotation.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f858dc4dc674981eb4817c470dd6036fbd60d35028d940857795f747a9fe9bc
3
+ size 368451
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ flask>=3.1,<4
2
+ pymupdf>=1.27,<2
3
+ pillow>=12.0,<13
4
+ ocrmypdf>=17.0,<18
static/app.js ADDED
@@ -0,0 +1,791 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Front-end logic: full placeholder list + background image/data loading + buffered highlights */
2
+ console.log("[app] build version:", window.APP_CONFIG?.buildVersion);
3
+
4
+ /* ---------- DOM Elements ---------- */
5
+ const pdfInput = document.getElementById('pdfInput');
6
+ const fileInfo = document.getElementById('fileInfo');
7
+ const wordsInput = document.getElementById('wordsInput');
8
+ const searchBtn = document.getElementById('searchBtn');
9
+ const resultsList = document.getElementById('resultsList');
10
+ const pageText = document.getElementById('pageText');
11
+ const legend = document.getElementById('legend');
12
+ const pagesDiv = document.getElementById('pages');
13
+ const statusMsg = document.getElementById('statusMsg');
14
+ const zoomIn = document.getElementById('zoomIn');
15
+ const zoomOut = document.getElementById('zoomOut');
16
+ const zoomVal = document.getElementById('zoomVal');
17
+ const divider = document.getElementById('divider');
18
+ const loadAllBtn = document.getElementById('loadAllBtn');
19
+ const ocrToggle = document.getElementById('ocrToggle');
20
+ const ocrLang = document.getElementById('ocrLang');
21
+ const downloadOcrLink = document.getElementById('downloadOcrLink');
22
+ const ocrStatusNote = document.getElementById('ocrStatusNote');
23
+
24
+ /* Overlay */
25
+ const processingOverlay = document.getElementById('processingOverlay');
26
+ const processingTitle = document.getElementById('processingTitle');
27
+ const processingDetail = document.getElementById('processingDetail');
28
+ const processingHint = document.getElementById('processingHint');
29
+ const processingError = document.getElementById('processingError');
30
+ const overlayCloseBtn = document.getElementById('overlayCloseBtn');
31
+ const processingSpinner = document.getElementById('processingSpinner');
32
+
33
+ /* ---------- Config ---------- */
34
+ ocrToggle.checked = false;
35
+ ocrLang.value = 'eng';
36
+
37
+ const HIGHLIGHT_BUFFER_BEFORE = 2;
38
+ const HIGHLIGHT_BUFFER_AFTER = 2;
39
+ const PREFETCH_EXTRA_AHEAD = 1;
40
+ let bufferedHighlightMode = true;
41
+
42
+ const ALWAYS_BACKGROUND_FULL_LOAD = true;
43
+ const BG_CONCURRENCY_BASE = 8;
44
+ const BG_CONCURRENCY_ACCEL = 24;
45
+
46
+ const LARGE_DOC_THRESHOLD = 80;
47
+ const PREVIEW_PAGES_LARGE = 10;
48
+ const PREVIEW_PAGES_SMALL = Infinity;
49
+
50
+ let currentScale = 1.0;
51
+ const MIN_SCALE = 0.5;
52
+ const MAX_SCALE = 2.5;
53
+ const SCALE_STEP = 0.15;
54
+
55
+ /* ---------- State ---------- */
56
+ let currentDoc = null;
57
+ let currentWords = [];
58
+ let searchResults = [];
59
+ let matchPageSet = new Set();
60
+
61
+ let pageCache = {}; // pageNum -> { tokens, text, imageLoadedPromise, overlay }
62
+ let pageLoadPromises = {}; // guard
63
+ let placeholderBuilt = false;
64
+
65
+ let seamlessHighlightActive = false;
66
+ let currentCenterPage = null;
67
+ let highlightedPages = new Set();
68
+ let scrollDirection = 0;
69
+ let programmaticScrollInProgress = false;
70
+ let pageObserver = null;
71
+ let jumpGeneration = 0;
72
+
73
+ let bgActive = false;
74
+ let bgAccelerated = false;
75
+ let bgCompleted = false;
76
+ let bgLoadedCount = 0;
77
+ let bgTotalToLoad = 0;
78
+ let bgConcurrency = BG_CONCURRENCY_BASE;
79
+ let bgStatusTimer = null;
80
+
81
+ /* ---------- Utility ---------- */
82
/** Show `msg` in the status element. */
function setStatus(msg) {
  statusMsg.textContent = msg;
}
83
/**
 * Split a raw query into unique, lower-cased words.
 * Words are separated by any run of commas, semicolons or whitespace;
 * first-occurrence order is kept. Non-string input yields [].
 */
function parseWords(raw) {
  if (typeof raw !== 'string') return [];
  const lowered = raw.trim()
    .split(/[,\s;]+/)
    .filter(Boolean)
    .map(w => w.toLowerCase());
  // A Set dedupes in one pass and preserves insertion order.
  return [...new Set(lowered)];
}
90
+
91
+ /* ---------- Overlay Helpers ---------- */
92
/**
 * Open the processing overlay in its "busy" state: spinner visible,
 * error text and close button hidden, hint shown only when `showHint`.
 */
function showProcessingOverlay(title, detail, showHint=true) {
  processingTitle.textContent = title;
  processingDetail.textContent = detail || '';

  const displayStates = [
    [processingHint, showHint ? 'block' : 'none'],
    [processingError, 'none'],
    [overlayCloseBtn, 'none'],
    [processingSpinner, 'block'],
  ];
  for (const [el, display] of displayStates) {
    el.style.display = display;
  }

  processingOverlay.classList.remove('hidden');
}
101
/**
 * Switch the overlay to its "Completed" state; the close button
 * appears two seconds later.
 */
function markOverlayCompleted(msg) {
  const revealCloseButton = () => {
    overlayCloseBtn.style.display = 'inline-flex';
  };
  processingTitle.textContent = 'Completed';
  processingDetail.textContent = msg || 'Done.';
  for (const el of [processingHint, processingSpinner]) {
    el.style.display = 'none';
  }
  setTimeout(revealCloseButton, 2000);
}
108
/** Put the overlay into its error state with `msg` and a visible close button. */
function showOverlayError(msg) {
  processingTitle.textContent = 'Error';
  processingError.textContent = msg;

  processingError.style.display = 'block';
  overlayCloseBtn.style.display = 'inline-flex';
  for (const el of [processingSpinner, processingHint]) {
    el.style.display = 'none';
  }
}
116
/** Hide the processing overlay by adding its `hidden` class. */
function hideProcessingOverlay() {
  const { classList } = processingOverlay;
  classList.add('hidden');
}
119
+ overlayCloseBtn.addEventListener('click', hideProcessingOverlay);
120
+
121
+ /* ---------- Upload Flow ---------- */
122
/*
 * Upload flow: send the chosen PDF (and OCR options) to the server, show
 * OCR status and the download link, then render preview pages and start
 * loading the remaining pages in the background.
 */
pdfInput.addEventListener('change', async (e) => {
  const f = e.target.files[0];
  if (!f) return;
  resetAll();

  const wantsOCR = ocrToggle.checked;
  showProcessingOverlay(
    wantsOCR ? 'Performing OCR...' : 'Processing PDF...',
    wantsOCR ? 'Running OCR (deskew + rotation). Please wait...' : 'Processing PDF text...',
    wantsOCR
  );

  setStatus("Uploading...");
  const fd = new FormData();
  fd.append("pdf", f);
  fd.append("ocr", String(wantsOCR));
  fd.append("lang", ocrLang.value.trim() || 'eng');

  let json;
  try {
    const res = await fetch("/api/upload", {method:"POST", body:fd});
    // An error response is not guaranteed to be JSON (presumably e.g. the
    // server's own page for an oversized upload), so parse defensively and
    // fall back to the HTTP status.
    json = await res.json().catch(() => null);
    if (!res.ok) throw new Error((json && json.error) || `Upload failed (HTTP ${res.status})`);
    if (!json) throw new Error("Upload failed: invalid server response");
  } catch (err) {
    console.error(err);
    showOverlayError(err.message);
    setStatus(err.message);
    return;
  }

  currentDoc = json;
  fileInfo.textContent = `${json.filename} (${json.pages} pages)`;
  enableLoadAllIfNeeded();
  enableZoom();

  // OCR status
  if (json.ocr_performed) {
    ocrStatusNote.style.display = 'block';
    ocrStatusNote.textContent = json.ocr_failed
      ? `OCR failed: ${json.ocr_message || 'Unknown error.'}`
      : (json.ocr_message || 'OCR completed.');
  } else {
    ocrStatusNote.style.display = 'none';
  }

  // Download OCR link
  if (json.ocr_performed && !json.ocr_failed && json.used_ocr_pdf) {
    try {
      const metaRes = await fetch(`/api/doc/${json.doc_id}/meta`);
      const metaJ = await metaRes.json();
      if (metaRes.ok && metaJ.download_ocr_url) {
        downloadOcrLink.href = metaJ.download_ocr_url;
        downloadOcrLink.style.display = 'inline-flex';
      }
    } catch (metaErr) {
      // The link is optional; keep going, but leave a trace for debugging.
      console.warn("[upload] could not fetch OCR download link", metaErr);
    }
  }

  markOverlayCompleted("Preview rendering...");

  try {
    await renderPreviewPages();
    buildAllPlaceholders(); // create placeholders for ALL pages (if not built)
    if (ALWAYS_BACKGROUND_FULL_LOAD) {
      // Fire and forget: load every remaining page automatically.
      startBackgroundLoading().catch(bgErr => console.warn("[bg] loading failed", bgErr));
    }
    setStatus("Preview ready. You can search now.");
  } catch (e2) {
    // Leave the overlay open so the error stays readable; the user
    // dismisses it with the close button.
    showOverlayError("Render error: " + e2.message);
    setStatus("Render error: " + e2.message);
    return;
  }

  // Auto-hide only on success.
  setTimeout(hideProcessingOverlay, 500);
});
196
+
197
+ /* ---------- Preview Pages ---------- */
198
/**
 * Load the first pages of the current document one after another.
 * Documents above LARGE_DOC_THRESHOLD pages preview PREVIEW_PAGES_LARGE
 * pages; smaller ones preview every page.
 */
async function renderPreviewPages() {
  if (!currentDoc) return;

  const total = currentDoc.pages;
  const isLarge = total > LARGE_DOC_THRESHOLD;
  const count = Math.min(isLarge ? PREVIEW_PAGES_LARGE : PREVIEW_PAGES_SMALL, total);

  let p = 1;
  while (p <= count) {
    await ensurePageLoaded(p);
    // Report progress every third page and on the last one.
    const reportNow = p % 3 === 0 || p === count;
    if (reportNow) {
      setStatus(`Loaded preview pages ${p}/${count}${count < total ? '...' : ''}`);
    }
    p += 1;
  }
}
210
+
211
+ /* ---------- Placeholder Construction ---------- */
212
/**
 * Append a placeholder element for every page that has no DOM node yet.
 * Runs once per document (guarded by `placeholderBuilt`).
 */
function buildAllPlaceholders() {
  if (!currentDoc || placeholderBuilt) return;

  const total = currentDoc.pages;
  const batch = document.createDocumentFragment();
  const hasPageNode = (n) => Boolean(pagesDiv.querySelector(`.page[data-page="${n}"]`));

  for (let p = 1; p <= total; p++) {
    // Pages already rendered (the preview) keep their existing node.
    if (hasPageNode(p)) continue;
    const node = document.createElement('div');
    node.className = 'page placeholder';
    node.dataset.page = p;
    node.innerHTML = `
<div class="page-inner">
<div class="placeholder-label">Page ${p}</div>
<div class="placeholder-spinner"></div>
</div>
`;
    batch.appendChild(node);
  }

  // Appending keeps numeric order because existing pages are the lowest numbers.
  pagesDiv.appendChild(batch);
  placeholderBuilt = true;
}
235
+
236
+ /* ---------- Background Loading (All Pages) ---------- */
237
/**
 * Load every page not yet in `pageCache`, using a pool of concurrent workers.
 *
 * @param {boolean} accelerate  use BG_CONCURRENCY_ACCEL workers instead of
 *                              BG_CONCURRENCY_BASE.
 *
 * NOTE(review): calling this while an earlier call is still running starts
 * a second worker pool and resets the shared progress counters. Both pools
 * then increment `bgLoadedCount`, so the displayed progress is clamped below.
 */
async function startBackgroundLoading(accelerate = false) {
  if (!currentDoc) return;
  if (bgCompleted) return;
  if (!bgActive) {
    bgActive = true;
    bgConcurrency = accelerate ? BG_CONCURRENCY_ACCEL : BG_CONCURRENCY_BASE;
  } else if (accelerate) {
    bgAccelerated = true;
    bgConcurrency = BG_CONCURRENCY_ACCEL;
  }

  const pending = [];
  for (let p = 1; p <= currentDoc.pages; p++) {
    if (!pageCache[p]) pending.push(p);
  }
  bgTotalToLoad = pending.length;
  bgLoadedCount = 0;

  if (!pending.length) {
    bgCompleted = true;
    bgActive = false;
    enableLoadAllIfNeeded();
    setStatus("All pages already loaded.");
    return;
  }

  if (!bgStatusTimer) {
    bgStatusTimer = setInterval(() => {
      if (!bgActive) return;
      // Clamp: overlapping invocations can push the count past the total.
      const done = Math.min(bgLoadedCount, bgTotalToLoad);
      const pct = bgTotalToLoad ? ((done / bgTotalToLoad) * 100).toFixed(1) : '100.0';
      setStatus(`Loading all pages (${done}/${bgTotalToLoad}) ${pct}%`);
    }, 1200);
  }

  // Workers pull page numbers from a shared cursor until the list is exhausted.
  let nextIndex = 0;
  async function worker() {
    while (nextIndex < pending.length) {
      const pageNum = pending[nextIndex++];
      try {
        await ensurePageLoaded(pageNum);
      } catch (e) {
        console.warn("[bg] page load error", pageNum, e);
      } finally {
        bgLoadedCount++;
      }
    }
  }

  const workers = [];
  for (let i = 0; i < bgConcurrency; i++) workers.push(worker());

  await Promise.all(workers);

  clearInterval(bgStatusTimer);
  bgStatusTimer = null;
  bgActive = false;
  bgCompleted = true;
  enableLoadAllIfNeeded();
  setStatus("All pages loaded.");
}
304
+
305
+ /* Load All button -> accelerate */
306
/* Load All button: start (or re-run) the full load with accelerated concurrency. */
loadAllBtn.addEventListener('click', async () => {
  if (!currentDoc) return;
  if (bgCompleted) {
    setStatus("All pages already loaded.");
    return;
  }
  setStatus("Accelerating full load...");
  try {
    // startBackgroundLoading resolves only after its workers finish and it
    // has cleared bgActive, so no follow-up polling is needed here.
    await startBackgroundLoading(true);
  } catch (err) {
    console.error("[bg] accelerated load failed", err);
    setStatus(`Loading pages failed: ${err.message}`);
  }
});
321
+
322
+ /* ---------- Search ---------- */
323
+ searchBtn.addEventListener('click', runSearch);
324
+ wordsInput.addEventListener('keydown', e => { if (e.key === 'Enter') runSearch(); });
325
+
326
/**
 * Run a search for the words in the input box: reset highlight state,
 * query the server, list matching pages and centre on the first match.
 */
async function runSearch() {
  if (!currentDoc) {
    setStatus("Upload a PDF first.");
    return;
  }

  const rawQuery = wordsInput.value;
  const parsed = parseWords(rawQuery);
  currentWords = parsed;
  updateLegend(parsed);

  // Drop all highlight state from the previous search.
  clearAllHighlights();
  seamlessHighlightActive = false;
  matchPageSet.clear();
  highlightedPages.clear();
  currentCenterPage = null;

  if (parsed.length === 0) {
    resultsList.innerHTML = '';
    pageText.value = '';
    setStatus("No words entered.");
    return;
  }

  setStatus("Searching...");
  let data;
  try {
    const url = `/api/doc/${currentDoc.doc_id}/search`;
    // The raw text is sent; the server parses it into words itself.
    const res = await fetch(url, {
      method:"POST",
      headers: {"Content-Type":"application/json"},
      body: JSON.stringify({words: rawQuery})
    });
    data = await res.json();
    if (!res.ok) throw new Error(data.error || "Search failed");
  } catch (e) {
    setStatus(e.message);
    return;
  }

  searchResults = data.results || [];
  populateResults();
  if (searchResults.length === 0) {
    setStatus("No pages found.");
    pageText.value = '';
    return;
  }
  matchPageSet = new Set(searchResults.map(hit => hit.page));

  const firstPage = searchResults[0].page;
  await ensurePageLoaded(firstPage);
  await preloadHighlightWindow(firstPage);
  setCenterPage(firstPage, {fromClick:true});
  seamlessHighlightActive = true;
  selectResultIndex(0, {preserveHighlights:true, skipScroll:true});
  scrollPageIntoView(firstPage);
  setStatus(`Ready. Highlight window centered at page ${firstPage}.`);
}
377
+
378
/** Rebuild the legend: a colour swatch plus the comma-joined search words. */
function updateLegend(words) {
  legend.innerHTML = '';
  if (words.length === 0) {
    legend.innerHTML = '<span class="dim">No words</span>';
    return;
  }

  const swatch = document.createElement('div');
  swatch.className = 'swatch';

  const label = document.createElement('div');
  label.textContent = words.join(', ');

  legend.appendChild(swatch);
  legend.appendChild(label);
}
391
+
392
/**
 * Render `searchResults` into the results list.
 *
 * Each entry shows the page number and the per-word hit counts, and jumps to
 * that page when clicked. The row is built with DOM nodes and `textContent`
 * rather than `innerHTML`, because the words come straight from the search
 * box and must never be interpreted as markup.
 */
function populateResults() {
  resultsList.innerHTML = '';
  if (!searchResults.length) {
    const li = document.createElement('li');
    li.textContent = '[No pages]';
    li.classList.add('dim');
    resultsList.appendChild(li);
    return;
  }
  searchResults.forEach((r, idx) => {
    const li = document.createElement('li');
    const counts = r.counts || {};
    // Only words that actually occur on this page are listed.
    const parts = currentWords
      .filter(w => counts[w])
      .map(w => `${w}:${counts[w]}`);

    const pageSpan = document.createElement('span');
    pageSpan.textContent = `Pg ${r.page}`;

    const countSpan = document.createElement('span');
    countSpan.style.opacity = '.7';
    countSpan.textContent = parts.join(', ');

    li.appendChild(pageSpan);
    li.appendChild(countSpan);
    li.addEventListener('click', () => jumpToResultPage(idx, r.page));
    resultsList.appendChild(li);
  });
}
413
+
414
+ /* ---------- Jump to Far Page ---------- */
415
/**
 * Jump the viewer to a search result that may be far from the current view.
 *
 * Loads the target page, starts preloading its highlight window, centres and
 * selects the result, then waits at most two seconds for the preload before
 * reporting. `jumpGeneration` lets a newer jump supersede an older one.
 *
 * Failures are reported on the status line, and the
 * `programmaticScrollInProgress` flag is always released unless a newer jump
 * has taken ownership of it; previously a rejected load left the flag set,
 * which made `handleIO` ignore every later scroll.
 *
 * @param {number} idx     Index of the result in `searchResults`.
 * @param {number} pageNum Page number to jump to.
 */
async function jumpToResultPage(idx, pageNum) {
  if (!currentDoc) return;
  jumpGeneration++;
  const myGen = jumpGeneration;
  setStatus(`Jumping to page ${pageNum}...`);
  programmaticScrollInProgress = true;

  let superseded = false;
  try {
    await ensurePageLoaded(pageNum);
    if (myGen !== jumpGeneration) { superseded = true; return; }

    // Neighbour pages are best-effort: log a failure, do not abort the jump.
    const preloadPromise = preloadHighlightWindow(pageNum)
      .catch(e => console.warn('[jump] preload failed:', e));
    setCenterPage(pageNum, {fromClick:true});
    await selectResultIndex(idx, {preserveHighlights:true, skipScroll:true});
    scrollPageIntoView(pageNum);

    let timedOut = false;
    const timeout = new Promise(r=>setTimeout(()=>{ timedOut = true; r(); }, 2000));
    await Promise.race([preloadPromise, timeout]);
    setStatus(timedOut ? `Page ${pageNum} ready (loading nearby...)` : `Centered on page ${pageNum}.`);
  } catch (e) {
    if (myGen !== jumpGeneration) {
      superseded = true;
    } else {
      console.error(`[jump] page ${pageNum} failed:`, e);
      setStatus(`Could not jump to page ${pageNum}: ${e.message}`);
    }
  } finally {
    // A superseding jump set the flag again and will release it itself.
    if (!superseded) {
      setTimeout(()=> programmaticScrollInProgress = false, 600);
    }
  }
}
437
+
438
/**
 * Load every page of the highlight window around `center` that is not yet in
 * `pageCache`, and wait until all of those loads have finished.
 */
async function preloadHighlightWindow(center) {
  const firstPage = Math.max(1, center - HIGHLIGHT_BUFFER_BEFORE);
  const lastPage = Math.min(currentDoc.pages, center + HIGHLIGHT_BUFFER_AFTER);

  const pending = [];
  let page = firstPage;
  while (page <= lastPage) {
    if (!pageCache[page]) {
      pending.push(ensurePageLoaded(page));
    }
    page += 1;
  }

  if (pending.length > 0) {
    await Promise.all(pending);
  }
}
447
+
448
+ /* ---------- Selecting Result ---------- */
449
/**
 * Mark result `idx` as active in the list, load its page and show its text.
 *
 * Outside buffered-highlight mode it also highlights the page: appended when
 * seamless highlighting is active, otherwise after clearing existing boxes
 * (unless `opts.preserveHighlights`). Scrolls to the page unless
 * `opts.skipScroll` is set.
 */
async function selectResultIndex(idx, opts={}) {
  const outOfRange = idx < 0 || idx >= searchResults.length;
  if (outOfRange) return;

  Array.from(resultsList.children).forEach((item, pos) => {
    item.classList.toggle('active', pos === idx);
  });

  const { page } = searchResults[idx];
  currentSelectedPage = page;
  await ensurePageLoaded(page);
  showPageText(page);

  if (!bufferedHighlightMode) {
    if (seamlessHighlightActive) {
      highlightPageMatches(page, {append:true});
    } else if (!opts.preserveHighlights) {
      clearAllHighlights();
      highlightPageMatches(page);
    }
  }

  if (opts.skipScroll) return;
  scrollPageIntoView(page);
}
466
+
467
/**
 * Put the cached text of `pageNum` into the page-text box, preceded by a
 * "Matches: ..." summary line when the page is one of the search results and
 * has non-zero counts. Does nothing when the page is not cached.
 */
function showPageText(pageNum) {
  const cached = pageCache[pageNum];
  if (!cached) return;

  let header = '';
  const hit = searchResults.find(r => r.page === pageNum);
  if (hit) {
    const tallies = [];
    for (const word of currentWords) {
      const label = `${word}=${hit.counts[word] || 0}`;
      if (!label.endsWith('=0')) tallies.push(label);
    }
    if (tallies.length) {
      header = 'Matches: ' + tallies.join(', ') + '\n' + '-'.repeat(40) + '\n';
    }
  }

  pageText.value = header + cached.text;
}
480
+
481
/** Smooth-scroll the element for `pageNum` to the top of the view, if it exists. */
function scrollPageIntoView(pageNum) {
  const target = pagesDiv.querySelector(`.page[data-page="${pageNum}"]`);
  if (!target) return;
  target.scrollIntoView({behavior:'smooth', block:'start'});
}
485
+
486
+ /* ---------- Intersection Observer (Center Detection) ---------- */
487
/**
 * Create the shared IntersectionObserver once (rooted at #pagesWrap) and
 * start observing every `.page` element currently in the pages container,
 * placeholders included.
 */
function ensurePageObserver() {
  if (pageObserver) return;

  const options = {
    root: document.getElementById('pagesWrap'),
    rootMargin: '0px',
    threshold: [0.25,0.5,0.75]
  };
  pageObserver = new IntersectionObserver(handleIO, options);

  for (const pageEl of pagesDiv.querySelectorAll('.page')) {
    pageObserver.observe(pageEl);
  }
}
497
/**
 * IntersectionObserver callback: pick the intersecting entry with the largest
 * intersection ratio and make its page the new centre page, recording the
 * scroll direction. Ignored outside buffered mode and during programmatic
 * scrolls.
 */
function handleIO(entries) {
  if (programmaticScrollInProgress || !bufferedHighlightMode) return;

  const best = entries
    .filter(entry => entry.isIntersecting)
    .reduce(
      (top, entry) => (!top || entry.intersectionRatio > top.intersectionRatio) ? entry : top,
      null
    );
  if (!best) return;

  const num = parseInt(best.target.dataset.page,10);
  if (num === currentCenterPage) return;

  if (currentCenterPage != null) {
    scrollDirection = num > currentCenterPage ? 1 : -1;
  }
  setCenterPage(num);
}
511
/**
 * Record `pageNum` as the centre page and refresh the highlight window.
 * When triggered by a click, observer-driven updates are suppressed for
 * 800 ms via `programmaticScrollInProgress`.
 */
function setCenterPage(pageNum, {fromClick=false} = {}) {
  currentCenterPage = pageNum;
  updateHighlightWindow();

  if (!fromClick) return;
  programmaticScrollInProgress = true;
  setTimeout(() => { programmaticScrollInProgress = false; }, 800);
}
519
+
520
+ /* ---------- Highlight Window Logic ---------- */
521
/**
 * Keep highlights in sync with the window of pages around the centre page.
 *
 * 1. Removes highlights from pages that fell outside the window.
 * 2. Highlights matching pages inside the window, loading them first when
 *    they are not cached. A page that finishes loading after the window has
 *    moved on is no longer highlighted.
 * 3. Prefetches up to PREFETCH_EXTRA_AHEAD matching pages beyond the window
 *    in the scroll direction. The backward range is walked from `start - 1`
 *    down to `start - PREFETCH_EXTRA_AHEAD`; the previous bounds were
 *    inverted and produced an empty range for any value larger than 1.
 *
 * Loads are fire-and-forget; failures are logged, not thrown.
 */
function updateHighlightWindow() {
  if (!currentDoc || !bufferedHighlightMode || currentCenterPage == null) return;
  const start = Math.max(1, currentCenterPage - HIGHLIGHT_BUFFER_BEFORE);
  const end = Math.min(currentDoc.pages, currentCenterPage + HIGHLIGHT_BUFFER_AFTER);

  for (const p of Array.from(highlightedPages)) {
    if (p < start || p > end) {
      clearHighlightsOnPage(p);
      highlightedPages.delete(p);
    }
  }

  // Evaluated when a deferred load completes, against the centre page at that time.
  const stillInWindow = (p) =>
    currentCenterPage != null &&
    p >= currentCenterPage - HIGHLIGHT_BUFFER_BEFORE &&
    p <= currentCenterPage + HIGHLIGHT_BUFFER_AFTER;

  const tasks = [];
  for (let p = start; p <= end; p++) {
    if (!matchPageSet.has(p) || highlightedPages.has(p)) continue;
    if (pageCache[p]) {
      highlightPageMatches(p,{append:false});
      highlightedPages.add(p);
    } else {
      tasks.push(ensurePageLoaded(p).then(()=>{
        if (pageCache[p] && matchPageSet.has(p) && stillInWindow(p)) {
          highlightPageMatches(p,{append:false});
          highlightedPages.add(p);
        }
      }));
    }
  }

  if (scrollDirection !== 0) {
    const prefetchPages = [];
    if (scrollDirection > 0) {
      const last = Math.min(currentDoc.pages, end + PREFETCH_EXTRA_AHEAD);
      for (let p = end + 1; p <= last; p++) prefetchPages.push(p);
    } else {
      const first = Math.max(1, start - PREFETCH_EXTRA_AHEAD);
      for (let p = start - 1; p >= first; p--) prefetchPages.push(p);
    }
    for (const p of prefetchPages) {
      if (matchPageSet.has(p) && !pageCache[p]) {
        tasks.push(ensurePageLoaded(p));
      }
    }
  }

  if (tasks.length) {
    Promise.all(tasks).catch(e=>console.warn('[highlight-window]', e));
  }
}
566
+
567
+ /* ---------- Page Loading ---------- */
568
/**
 * Make sure page `pageNum` is fetched, rendered and cached.
 *
 * Returns immediately when the page is already in `pageCache`, and shares the
 * in-flight promise when a load for the same page is already running.
 * Otherwise it reuses (or creates) the page's placeholder element, fetches
 * the page metadata, swaps in the image, label and highlight overlay, stores
 * tokens/text in `pageCache`, waits for the image to settle, registers the
 * element with the observer and applies highlights where appropriate.
 *
 * The document is captured up front. If it is reset or replaced while the
 * request is in flight, the stale response is dropped instead of being
 * written into the new document's cache.
 *
 * @param {number} pageNum 1-based page number.
 * @throws {Error} When the page request returns a non-OK response.
 */
async function ensurePageLoaded(pageNum) {
  if (pageCache[pageNum]) return;
  if (pageLoadPromises[pageNum]) return pageLoadPromises[pageNum];

  const doc = currentDoc;
  const loadPromise = (async () => {
    if (!doc) return;

    // Reuse the placeholder element when one exists.
    let pageEl = pagesDiv.querySelector(`.page[data-page="${pageNum}"]`);
    if (!pageEl) {
      // Fallback for a missing placeholder: build one and insert it in numeric order.
      pageEl = document.createElement('div');
      pageEl.className = 'page placeholder';
      pageEl.dataset.page = pageNum;
      pageEl.innerHTML = `
        <div class="page-inner">
          <div class="placeholder-label">Page ${pageNum}</div>
          <div class="placeholder-spinner"></div>
        </div>
      `;
      const nextEl = [...pagesDiv.querySelectorAll('.page')]
        .find(el => pageNum < parseInt(el.dataset.page,10));
      if (nextEl) pagesDiv.insertBefore(pageEl, nextEl);
      else pagesDiv.appendChild(pageEl);
    }

    // Fetch page meta
    const res = await fetch(`/api/doc/${doc.doc_id}/page/${pageNum}`);
    const data = await res.json();
    if (!res.ok) throw new Error(data.error || `Failed to load page ${pageNum}`);

    // The document changed while the request was in flight: discard the result.
    if (currentDoc !== doc) return;
    // Already replaced by a concurrent load.
    if (pageEl.classList.contains('loaded')) return;

    pageEl.classList.remove('placeholder');
    pageEl.classList.add('loaded');
    pageEl.innerHTML = ''; // clear placeholder

    const img = document.createElement('img');
    img.src = data.image_url;
    img.alt = `Page ${pageNum}`;
    // NOTE(review): with loading='lazy' the browser may defer fetching an
    // off-screen image, in which case the await below would not settle until
    // the page nears the viewport - confirm this is intended.
    img.loading = 'lazy';
    img.decoding = 'async';
    pageEl.appendChild(img);

    const label = document.createElement('div');
    label.className = 'page-label';
    label.textContent = `Page ${pageNum}`;
    pageEl.appendChild(label);

    const overlay = document.createElement('div');
    overlay.className = 'overlay';
    overlay.style.position = 'absolute';
    overlay.style.inset = '0';
    overlay.style.pointerEvents = 'none';
    pageEl.appendChild(overlay);

    pageCache[pageNum] = {
      tokens: data.tokens,
      text: data.text,
      // Resolves on load or error so a broken image never blocks callers.
      imageLoadedPromise: new Promise(resolve => {
        img.onload = () => resolve();
        img.onerror = () => resolve();
      }),
      overlay
    };
    await pageCache[pageNum].imageLoadedPromise;
    if (currentDoc !== doc) return;

    ensurePageObserver();
    if (pageObserver) pageObserver.observe(pageEl);

    if (bufferedHighlightMode && matchPageSet.has(pageNum)) {
      if (currentCenterPage != null &&
          pageNum >= currentCenterPage - HIGHLIGHT_BUFFER_BEFORE &&
          pageNum <= currentCenterPage + HIGHLIGHT_BUFFER_AFTER) {
        highlightPageMatches(pageNum);
        highlightedPages.add(pageNum);
      }
    } else if (seamlessHighlightActive && !bufferedHighlightMode && matchPageSet.has(pageNum)) {
      highlightPageMatches(pageNum, {append:true});
    }
  })();
  pageLoadPromises[pageNum] = loadPromise;

  try {
    await loadPromise;
  } finally {
    // Only drop our own entry; a newer load for this page may have replaced it.
    if (pageLoadPromises[pageNum] === loadPromise) {
      delete pageLoadPromises[pageNum];
    }
  }
}
665
+
666
+ /* ---------- Highlighting ---------- */
667
/** Remove every highlight box (`.hl-box`) from the document. */
function clearAllHighlights() {
  for (const box of document.querySelectorAll('.hl-box')) {
    box.remove();
  }
}
670
/** Remove the highlight boxes of a single page; no-op when the page element is missing. */
function clearHighlightsOnPage(pageNum) {
  const host = pagesDiv.querySelector(`.page[data-page="${pageNum}"]`);
  if (host) {
    for (const box of host.querySelectorAll('.hl-box')) {
      box.remove();
    }
  }
}
675
/**
 * Draw a highlight box over every cached token of `pageNum` whose lower-cased
 * text equals one of the current search words. Box geometry is written as
 * percentages computed from the token's bbox values multiplied by 100.
 * Existing boxes on the page are removed first unless `append` is true.
 */
function highlightPageMatches(pageNum, {append=false} = {}) {
  const cache = pageCache[pageNum];
  if (!cache || !currentWords.length) return;
  if (!append) clearHighlightsOnPage(pageNum);

  const wanted = new Set(currentWords);
  const asPercent = value => (value*100)+'%';
  const boxes = document.createDocumentFragment();

  for (const tok of cache.tokens) {
    if (!wanted.has(tok.text.toLowerCase())) continue;

    const [x0,y0,x1,y1] = tok.bbox;
    const box = document.createElement('div');
    box.className = 'hl-box';
    box.style.left = asPercent(x0);
    box.style.top = asPercent(y0);
    box.style.width = asPercent(x1 - x0);
    box.style.height = asPercent(y1 - y0);
    boxes.appendChild(box);
  }

  cache.overlay.appendChild(boxes);
}
697
+
698
+ /* ---------- Zoom / Resize ---------- */
699
+ window.addEventListener('resize', () => { /* percentage boxes auto-scale */ });
700
+
701
/** Enable both zoom buttons. */
function enableZoom() {
  for (const btn of [zoomIn, zoomOut]) {
    btn.disabled = false;
  }
}
705
/** Disable both zoom buttons and reset the view to 100% with no transform. */
function disableZoom() {
  for (const btn of [zoomIn, zoomOut]) {
    btn.disabled = true;
  }
  currentScale = 1.0;
  zoomVal.textContent = '100%';
  pagesDiv.style.transform = '';
}
712
+ zoomIn.addEventListener('click', () => applyZoom(currentScale + SCALE_STEP));
713
+ zoomOut.addEventListener('click', () => applyZoom(currentScale - SCALE_STEP));
714
/**
 * Scale the pages container to `newScale`, clamped to
 * [MIN_SCALE, MAX_SCALE]. Does nothing without a document or when the
 * clamped value is within 0.001 of the current scale.
 */
function applyZoom(newScale) {
  if (!currentDoc) return;

  const clamped = Math.min(MAX_SCALE, Math.max(MIN_SCALE, newScale));
  const unchanged = Math.abs(clamped - currentScale) < 0.001;
  if (unchanged) return;

  currentScale = clamped;
  zoomVal.textContent = Math.round(clamped*100) + '%';

  const { style } = pagesDiv;
  style.transformOrigin = 'top center';
  style.transform = `scale(${clamped})`;
}
723
+
724
+ /* ---------- Sidebar Resize ---------- */
725
/**
 * Wire up drag-to-resize for the sidebar divider. While dragging, the
 * `--sidebar-width` custom property follows the pointer's clientX, clamped
 * between 220px and the smaller of 60% of the window width and 700px.
 */
(function enableDivider() {
  const MIN_WIDTH = 220;
  let dragging = false;

  const startDrag = () => {
    dragging = true;
    document.body.style.userSelect = 'none';
    document.documentElement.style.cursor = 'col-resize';
  };

  const stopDrag = () => {
    if (!dragging) return;
    dragging = false;
    document.body.style.userSelect = '';
    document.documentElement.style.cursor = '';
  };

  const onMove = (e) => {
    if (!dragging) return;
    const maxWidth = Math.min(window.innerWidth * 0.6, 700);
    const width = Math.max(MIN_WIDTH, Math.min(maxWidth, e.clientX));
    document.documentElement.style.setProperty('--sidebar-width', width + 'px');
  };

  divider.addEventListener('mousedown', startDrag);
  window.addEventListener('mouseup', stopDrag);
  window.addEventListener('mousemove', onMove);
})();
747
+
748
+ /* ---------- Load All Button State ---------- */
749
/**
 * Sync the "Load All" button: disabled when no document is loaded, otherwise
 * disabled exactly when `bgCompleted` is set.
 */
function enableLoadAllIfNeeded() {
  loadAllBtn.disabled = currentDoc ? bgCompleted : true;
}
753
+
754
+ /* ---------- Reset ---------- */
755
/**
 * Return the viewer to its initial empty state: drop the document, search,
 * cache, highlight and background-loading state, clear the UI, and tear down
 * the page observer.
 */
function resetAll() {
  // Document and search state
  currentDoc = null;
  currentWords = [];
  searchResults = [];
  matchPageSet.clear();

  // Page cache and in-flight loads
  pageCache = {};
  pageLoadPromises = {};
  placeholderBuilt = false;

  // Highlight window and jump tracking
  seamlessHighlightActive = false;
  currentCenterPage = null;
  highlightedPages.clear();
  scrollDirection = 0;
  jumpGeneration = 0;

  // Background loading
  bgActive = false;
  bgAccelerated = false;
  bgCompleted = false;
  if (bgStatusTimer) {
    clearInterval(bgStatusTimer);
    bgStatusTimer = null;
  }

  // UI
  fileInfo.textContent = '';
  resultsList.innerHTML = '';
  pageText.value = '';
  legend.innerHTML = '<span class="dim">No words</span>';
  pagesDiv.innerHTML = '';
  disableZoom();
  downloadOcrLink.style.display = 'none';
  ocrStatusNote.style.display = 'none';
  setStatus("Ready.");

  // Observer
  if (!pageObserver) return;
  pageObserver.disconnect();
  pageObserver = null;
}
789
+
790
+ /* ---------- Init ---------- */
791
+ setStatus("Ready.");
static/app2.js ADDED
@@ -0,0 +1,751 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Front-end logic with buffered, on-demand highlight lifecycle + duplicate load guard + fast jump preloading */
2
+ console.log("[app] build version:", window.APP_CONFIG?.buildVersion);
3
+
4
+ const pdfInput = document.getElementById('pdfInput');
5
+ const fileInfo = document.getElementById('fileInfo');
6
+ const wordsInput = document.getElementById('wordsInput');
7
+ const searchBtn = document.getElementById('searchBtn');
8
+ const resultsList = document.getElementById('resultsList');
9
+ const pageText = document.getElementById('pageText');
10
+ const legend = document.getElementById('legend');
11
+ const pagesDiv = document.getElementById('pages');
12
+ const statusMsg = document.getElementById('statusMsg');
13
+ const zoomIn = document.getElementById('zoomIn');
14
+ const zoomOut = document.getElementById('zoomOut');
15
+ const zoomVal = document.getElementById('zoomVal');
16
+ const divider = document.getElementById('divider');
17
+ const loadAllBtn = document.getElementById('loadAllBtn');
18
+ const ocrToggle = document.getElementById('ocrToggle');
19
+ const ocrLang = document.getElementById('ocrLang');
20
+ const downloadOcrLink = document.getElementById('downloadOcrLink');
21
+ const ocrStatusNote = document.getElementById('ocrStatusNote');
22
+
23
+ const processingOverlay = document.getElementById('processingOverlay');
24
+ const processingTitle = document.getElementById('processingTitle');
25
+ const processingDetail = document.getElementById('processingDetail');
26
+ const processingHint = document.getElementById('processingHint');
27
+ const processingError = document.getElementById('processingError');
28
+ const overlayCloseBtn = document.getElementById('overlayCloseBtn');
29
+ const processingSpinner = document.getElementById('processingSpinner');
30
+
31
+ ocrToggle.checked = false;
32
+ ocrLang.value = 'eng';
33
+
34
+ let currentDoc = null;
35
+ let currentWords = [];
36
+ let searchResults = [];
37
+ let currentSelectedPage = null;
38
+ let pageCache = {};
39
+ let currentScale = 1.0;
40
+ const MIN_SCALE = 0.5;
41
+ const MAX_SCALE = 2.5;
42
+ const SCALE_STEP = 0.15;
43
+
44
+ /* Track in-flight page loads to prevent duplicates */
45
+ const pageLoadPromises = {}; // pageNum -> Promise
46
+
47
+ let matchPageSet = new Set();
48
+ let seamlessHighlightActive = false;
49
+
50
+ /* Buffered Highlight Configuration */
51
+ const HIGHLIGHT_BUFFER_BEFORE = 2;
52
+ const HIGHLIGHT_BUFFER_AFTER = 2;
53
+ const PREFETCH_EXTRA_AHEAD = 1;
54
+ let bufferedHighlightMode = true;
55
+
56
+ /* Buffered highlight state */
57
+ let currentCenterPage = null;
58
+ let highlightedPages = new Set();
59
+ let scrollDirection = 0; // -1 up, +1 down
60
+ let programmaticScrollInProgress = false;
61
+ let pageObserver = null;
62
+
63
+ /* Jump control */
64
+ let currentJumpToken = 0;
65
+
66
+ /* Loading strategy */
67
+ const LARGE_DOC_THRESHOLD = 80;
68
+ const AUTO_LOAD_PAGES_LARGE = 10;
69
+ const AUTO_LOAD_PAGES_SMALL = Infinity;
70
+
71
+ /* Overlay state */
72
+ let overlayCompletedTimestamp = null;
73
+ let overlayForceHideTimer = null;
74
+
75
/** Show `msg` on the status line. */
function setStatus(msg) {
  statusMsg.textContent = msg;
}
78
+
79
/**
 * Split raw search input on commas, semicolons and whitespace, lower-case
 * each piece, and return the distinct words in order of first appearance.
 */
function parseWords(raw) {
  const unique = new Set();
  for (const piece of raw.trim().split(/[,\s;]+/)) {
    if (piece) unique.add(piece.toLowerCase());
  }
  return [...unique];
}
86
+
87
+ /* Overlay helpers */
88
/**
 * Show the processing overlay in its "busy" state: title and detail text set,
 * spinner visible, error and close button hidden, hint shown only when
 * `showHint` is true. Also cancels any pending force-hide timer.
 */
function showProcessingOverlay(title, detail, showHint=true) {
  if (overlayForceHideTimer) {
    clearTimeout(overlayForceHideTimer);
    overlayForceHideTimer = null;
  }
  overlayCompletedTimestamp = null;

  processingTitle.textContent = title;
  processingDetail.textContent = detail || '';

  processingHint.style.display = showHint ? 'block' : 'none';
  processingError.style.display = 'none';
  overlayCloseBtn.style.display = 'none';
  processingSpinner.style.display = 'block';

  processingOverlay.classList.remove('hidden');
}
102
+
103
/**
 * Switch the overlay to its "Completed" state with `successMsg` (default
 * 'Done.'), hide hint and spinner, and record the completion time. If the
 * overlay is still visible 2.5 s later, the close button is revealed.
 */
function markOverlayCompleted(successMsg) {
  processingTitle.textContent = 'Completed';
  processingDetail.textContent = successMsg || 'Done.';
  for (const el of [processingHint, processingSpinner]) {
    el.style.display = 'none';
  }
  overlayCompletedTimestamp = performance.now();

  const revealCloseIfStillOpen = () => {
    const stillOpen = !processingOverlay.classList.contains('hidden');
    if (stillOpen) overlayCloseBtn.style.display = 'inline-flex';
  };
  setTimeout(revealCloseIfStillOpen, 2500);
}
115
+
116
/**
 * Put the overlay into its "Error" state: show `msg`, hide spinner and hint,
 * and reveal the close button.
 */
function showOverlayError(msg) {
  processingTitle.textContent = 'Error';

  processingError.textContent = msg;
  processingError.style.display = 'block';

  processingSpinner.style.display = 'none';
  processingHint.style.display = 'none';
  overlayCloseBtn.style.display = 'inline-flex';
}
124
+
125
/** Hide the processing overlay and cancel the pending force-hide timer, if any. */
function hideProcessingOverlay() {
  processingOverlay.classList.add('hidden');
  if (!overlayForceHideTimer) return;
  clearTimeout(overlayForceHideTimer);
  overlayForceHideTimer = null;
}
132
+
133
+ overlayCloseBtn.addEventListener('click', hideProcessingOverlay);
134
+
135
+ /* Upload */
136
/*
 * Upload handler: resets the viewer, posts the chosen PDF (with the OCR
 * options) to /api/upload, reflects the OCR outcome in the UI, then renders
 * the initial pages.
 *
 * The overlay is auto-hidden only after a successful render. Previously the
 * hide timers sat in a `finally`, so a render error shown by
 * showOverlayError() was dismissed 400 ms later, before it could be read.
 */
pdfInput.addEventListener('change', async (e) => {
  const f = e.target.files[0];
  if (!f) return;
  resetAll();

  const wantsOCR = ocrToggle.checked;
  showProcessingOverlay(
    wantsOCR ? 'Performing OCR...' : 'Processing PDF...',
    wantsOCR
      ? 'Running OCR (deskew + text extraction). Please wait...'
      : 'Indexing document text. Please wait...',
    wantsOCR
  );

  setStatus("Uploading...");
  const fd = new FormData();
  fd.append("pdf", f);
  fd.append("ocr", String(wantsOCR));
  fd.append("lang", ocrLang.value.trim() || 'eng');

  let json;
  try {
    const res = await fetch("/api/upload", { method: "POST", body: fd });
    json = await res.json();
    if (!res.ok) {
      throw new Error(json.error || "Upload / processing failed");
    }
  } catch (err) {
    console.error("[upload] error:", err, json);
    setStatus(err.message || "Upload error");
    showOverlayError((json && json.error) ? json.error : err.message);
    return;
  }

  currentDoc = json;
  fileInfo.textContent = `${json.filename} (${json.pages} pages)`;
  enableZoom();
  enableLoadAllIfNeeded();

  if (json.ocr_performed) {
    ocrStatusNote.style.display = 'block';
    if (json.ocr_failed) {
      ocrStatusNote.textContent = `OCR failed: ${json.ocr_message || 'Unknown error.'}`;
    } else {
      ocrStatusNote.textContent = json.ocr_message || 'OCR completed.';
    }
  } else {
    ocrStatusNote.style.display = 'none';
    ocrStatusNote.textContent = '';
  }

  if (json.ocr_performed && !json.ocr_failed && json.used_ocr_pdf) {
    // The download link is optional: a failed meta request is only logged.
    try {
      const metaRes = await fetch(`/api/doc/${json.doc_id}/meta`);
      const metaJ = await metaRes.json();
      if (metaRes.ok && metaJ.download_ocr_url) {
        downloadOcrLink.href = metaJ.download_ocr_url;
        downloadOcrLink.style.display = 'inline-flex';
      }
    } catch (e) {
      console.warn("[meta] fetch failed:", e);
    }
  } else {
    downloadOcrLink.style.display = 'none';
  }

  markOverlayCompleted(
    (json.ocr_performed && !json.ocr_failed)
      ? `OCR finished in ${(json.ocr_time_seconds || 0).toFixed(1)}s. Rendering pages...`
      : (json.ocr_performed && json.ocr_failed)
        ? `Rendering original pages (OCR failed).`
        : `Rendering pages...`
  );

  try {
    setStatus("Rendering pages...");
    await autoRenderInitialPages();
    setStatus("Pages ready. Enter words & press Search.");
  } catch (renderErr) {
    console.error("[render] error:", renderErr);
    setStatus("Render error: " + renderErr.message);
    // Leave the error overlay up; the user dismisses it with the close button.
    showOverlayError("Render error: " + renderErr.message);
    return;
  }

  // Success: hide shortly, with a long fallback in case the overlay is still up.
  setTimeout(hideProcessingOverlay, 400);
  overlayForceHideTimer = setTimeout(() => {
    if (!processingOverlay.classList.contains('hidden')) {
      console.warn("[overlay] force hiding after timeout");
      hideProcessingOverlay();
    }
  }, 15000);
});
229
+
230
+ /* Load All Pages */
231
/*
 * "Load All Pages": load every page of the current document sequentially,
 * reporting progress every fifth page.
 *
 * The document is captured once and re-checked after each page. If it is
 * cleared or replaced while the loop runs, the loop stops quietly instead of
 * reading `.pages` from a missing document or loading pages of the new one.
 */
loadAllBtn.addEventListener('click', async () => {
  const doc = currentDoc;
  if (!doc) return;
  loadAllBtn.disabled = true;
  setStatus("Loading remaining pages...");
  const start = performance.now();
  for (let p = 1; p <= doc.pages; p++) {
    await safeEnsurePage(p);
    if (currentDoc !== doc) return; // document changed mid-load
    if (p % 5 === 0) setStatus(`Loading remaining pages ${p}/${doc.pages}...`);
  }
  const dur = (performance.now() - start)/1000;
  setStatus(`All pages loaded (${dur.toFixed(1)}s).`);
});
243
+
244
/**
 * Sync the "Load All" button: disabled when no document is loaded or when
 * the document has at most LARGE_DOC_THRESHOLD pages.
 */
function enableLoadAllIfNeeded() {
  loadAllBtn.disabled = !currentDoc || currentDoc.pages <= LARGE_DOC_THRESHOLD;
}
251
+
252
+ /* Initial pages */
253
/**
 * Load the first pages right after upload: AUTO_LOAD_PAGES_LARGE pages for
 * documents above LARGE_DOC_THRESHOLD, otherwise up to AUTO_LOAD_PAGES_SMALL,
 * capped at the page count. Progress is reported every third page and on the
 * last one.
 */
async function autoRenderInitialPages() {
  if (!currentDoc) return;

  const total = currentDoc.pages;
  const isLarge = total > LARGE_DOC_THRESHOLD;
  const count = Math.min(isLarge ? AUTO_LOAD_PAGES_LARGE : AUTO_LOAD_PAGES_SMALL, total);
  const isPreview = count < total;
  const suffix = isPreview ? ' (preview)' : '';

  for (let page = 1; page <= count; page++) {
    await safeEnsurePage(page);
    const shouldReport = page === count || page % 3 === 0;
    if (shouldReport) {
      setStatus(`Rendering pages ${page}/${count}${suffix}...`);
    }
  }

  if (isPreview) {
    setStatus(`Preview loaded (${count}/${total}). Load All Pages or search.`);
  }
}
268
+
269
+ /* Preload surrounding buffer for fast jumps */
270
/**
 * Load, in parallel, every uncached page of the highlight window around
 * `centerPage`, and wait for all of them.
 */
async function preloadJumpWindow(centerPage) {
  const firstPage = Math.max(1, centerPage - HIGHLIGHT_BUFFER_BEFORE);
  const lastPage = Math.min(currentDoc.pages, centerPage + HIGHLIGHT_BUFFER_AFTER);

  const pending = [];
  let page = firstPage;
  while (page <= lastPage) {
    if (!pageCache[page]) pending.push(safeEnsurePage(page));
    page += 1;
  }

  if (pending.length > 0) {
    await Promise.all(pending);
  }
}
283
+
284
/**
 * Non-throwing wrapper around ensurePageLoaded: a failure is logged to the
 * console and shown on the status line instead of being propagated.
 */
async function safeEnsurePage(pageNum) {
  try {
    await ensurePageLoaded(pageNum);
  } catch (loadErr) {
    console.error(`[page ${pageNum}] load error:`, loadErr);
    setStatus(`Page ${pageNum} load error: ${loadErr.message}`);
  }
}
292
+
293
+ /* Search */
294
+ searchBtn.addEventListener('click', runSearch);
295
+ wordsInput.addEventListener('keydown', e => {
296
+ if (e.key === 'Enter') runSearch();
297
+ });
298
+
299
/**
 * Search the current document for the words in the input box, list the
 * matching pages, and centre the highlight window on the first match.
 */
async function runSearch() {
  if (!currentDoc) {
    setStatus("Upload a PDF first.");
    return;
  }

  const raw = wordsInput.value;
  currentWords = parseWords(raw);
  updateLegend(currentWords);

  // Start from a clean highlight state.
  clearAllHighlights();
  seamlessHighlightActive = false;
  matchPageSet.clear();
  highlightedPages.clear();
  currentCenterPage = null;

  if (currentWords.length === 0) {
    resultsList.innerHTML = '';
    pageText.value = '';
    setStatus("No words entered.");
    return;
  }

  setStatus("Searching...");
  let data;
  try {
    const request = {
      method: "POST",
      headers: {"Content-Type":"application/json"},
      body: JSON.stringify({words: raw})
    };
    const res = await fetch(`/api/doc/${currentDoc.doc_id}/search`, request);
    data = await res.json();
    if (!res.ok) throw new Error(data.error || "Search failed");
  } catch (err) {
    console.error("[search] error:", err, data);
    setStatus(err.message);
    return;
  }

  searchResults = data.results || [];
  populateResults();
  if (searchResults.length === 0) {
    setStatus("No pages found.");
    pageText.value = '';
    return;
  }
  matchPageSet = new Set(searchResults.map(hit => hit.page));

  const [{ page: firstPage }] = searchResults;
  await safeEnsurePage(firstPage);
  currentJumpToken++; // reset jump token context
  await preloadJumpWindow(firstPage);
  setCenterPage(firstPage, { fromClick:true });
  seamlessHighlightActive = true;
  // Scrolling is done explicitly below, once the window has been built.
  selectResultIndex(0, {preserveHighlights:true, skipScroll:true});
  scrollPageIntoView(firstPage);
  setStatus(`Ready. Highlight window centered at page ${firstPage}.`);
}
356
+
357
/**
 * Rebuild the legend: a swatch plus the comma-separated words, or a dimmed
 * "No words" placeholder for an empty list.
 */
function updateLegend(words) {
  if (words.length === 0) {
    legend.innerHTML = '<span class="dim">No words</span>';
    return;
  }

  legend.innerHTML = '';

  const swatch = document.createElement('div');
  swatch.className = 'swatch';

  const label = document.createElement('div');
  label.textContent = words.join(', ');

  for (const node of [swatch, label]) {
    legend.appendChild(node);
  }
}
370
+
371
/**
 * Render `searchResults` into the results list.
 *
 * Each row shows the page number and per-word hit counts. Clicking a row
 * loads the page and its highlight window before scrolling, and a newer
 * click aborts an older one through `currentJumpToken`. Rows are built with
 * DOM nodes and `textContent` rather than `innerHTML`, because the words are
 * raw user input and must not be parsed as markup.
 */
function populateResults() {
  resultsList.innerHTML = '';
  if (!searchResults.length) {
    const li = document.createElement('li');
    li.textContent = '[No pages]';
    li.classList.add('dim');
    resultsList.appendChild(li);
    return;
  }
  searchResults.forEach((r, idx) => {
    const li = document.createElement('li');
    const counts = r.counts || {};
    // Only words that actually occur on this page are listed.
    const parts = currentWords
      .filter(w => counts[w])
      .map(w => `${w}:${counts[w]}`);

    const pageSpan = document.createElement('span');
    pageSpan.textContent = `Pg ${r.page}`;

    const countSpan = document.createElement('span');
    countSpan.style.opacity = '.75';
    countSpan.textContent = parts.join(', ');

    li.appendChild(pageSpan);
    li.appendChild(countSpan);

    li.addEventListener('click', async () => {
      const token = ++currentJumpToken;
      setStatus(`Jumping to page ${r.page}...`);
      // Load target + its highlight window first to avoid layout shift AFTER scroll
      await safeEnsurePage(r.page);
      await preloadJumpWindow(r.page);
      if (token !== currentJumpToken) return; // aborted by newer click
      await selectResultIndex(idx, {preserveHighlights:true, skipScroll:true});
      setCenterPage(r.page, { fromClick:true });
      // highlight window already loaded; updateHighlightWindow will just highlight
      scrollPageIntoView(r.page);
      setStatus(`Centered on page ${r.page}.`);
    });
    resultsList.appendChild(li);
  });
}
404
+
405
/**
 * Activate result `idx` in the list, load its page and show its text.
 * Outside buffered-highlight mode the page is also highlighted (appended in
 * seamless mode, otherwise after clearing unless `opts.preserveHighlights`).
 * Scrolls to the page unless `opts.skipScroll` is set.
 */
async function selectResultIndex(idx, opts = {}) {
  const outOfRange = idx < 0 || idx >= searchResults.length;
  if (outOfRange) return;

  Array.from(resultsList.children).forEach((item, pos) => {
    item.classList.toggle('active', pos === idx);
  });

  const { page } = searchResults[idx];
  currentSelectedPage = page;
  await safeEnsurePage(page);
  showPageText(page);

  if (!bufferedHighlightMode) {
    if (seamlessHighlightActive) {
      highlightPageMatches(page, {append:true});
    } else if (!opts.preserveHighlights) {
      clearAllHighlights();
      highlightPageMatches(page);
    }
  }

  if (opts.skipScroll) return;
  scrollPageIntoView(page);
}
426
+
427
/**
 * Show the cached text of `pageNum` in the page-text box, prefixed with a
 * "Matches: ..." summary when the page is a search hit with non-zero counts.
 * No-op when the page is not cached.
 */
function showPageText(pageNum) {
  const cached = pageCache[pageNum];
  if (!cached) return;

  let header = '';
  const hit = searchResults.find(r => r.page === pageNum);
  if (hit) {
    const tallies = [];
    for (const word of currentWords) {
      const label = `${word}=${hit.counts[word] || 0}`;
      if (!label.endsWith('=0')) tallies.push(label);
    }
    if (tallies.length) {
      header = 'Matches: '+tallies.join(', ')+'\n'+'-'.repeat(40)+'\n';
    }
  }

  pageText.value = header + cached.text;
}
440
+
441
/** Smooth-scroll the element for `pageNum` to the top of the view, if present. */
function scrollPageIntoView(pageNum) {
  const target = document.querySelector(`.page[data-page="${pageNum}"]`);
  if (!target) return;
  target.scrollIntoView({behavior:'smooth', block:'start'});
}
447
+
448
+ /* Observer */
449
/** Lazily create the shared IntersectionObserver, rooted at #pagesWrap. */
function ensurePageObserver() {
  if (pageObserver) return;

  const options = {
    root: document.getElementById('pagesWrap'),
    rootMargin: '0px',
    threshold: [0.25, 0.5, 0.75]
  };
  pageObserver = new IntersectionObserver(handlePageIntersections, options);
}
457
+
458
/**
 * IntersectionObserver callback: choose the intersecting entry with the
 * highest intersection ratio and make its page the centre page, updating the
 * scroll direction. Skipped outside buffered mode, for empty batches, and
 * while a programmatic scroll is in progress.
 */
function handlePageIntersections(entries) {
  if (!bufferedHighlightMode || !entries.length) return;
  if (programmaticScrollInProgress) return;

  const best = entries
    .filter(entry => entry.isIntersecting)
    .reduce(
      (top, entry) => (!top || entry.intersectionRatio > top.intersectionRatio) ? entry : top,
      null
    );
  if (!best) return;

  const pageNum = parseInt(best.target.dataset.page, 10);
  if (pageNum === currentCenterPage) return;

  if (currentCenterPage != null) {
    scrollDirection = pageNum > currentCenterPage ? 1 : -1;
  }
  setCenterPage(pageNum);
}
478
+
479
/**
 * Record `pageNum` as the center page and refresh the highlight window.
 * With `fromClick`, observer-driven re-centering is suppressed for 800 ms.
 */
function setCenterPage(pageNum, { fromClick = false } = {}) {
  currentCenterPage = pageNum;
  updateHighlightWindow();
  if (!fromClick) return;

  programmaticScrollInProgress = true;
  setTimeout(() => {
    programmaticScrollInProgress = false;
  }, 800);
}
487
+
488
/**
 * Keep highlights limited to the pages inside the buffer window around
 * `currentCenterPage`:
 *   1. remove highlights from pages that left the window,
 *   2. highlight matching pages inside it (loading them first if needed),
 *   3. prefetch matching pages just past the window in the scroll direction.
 *
 * Fire-and-forget: page loads run in the background and failures are logged.
 */
function updateHighlightWindow() {
  if (!currentDoc || !bufferedHighlightMode) return;
  if (currentCenterPage == null) return;

  const start = Math.max(1, currentCenterPage - HIGHLIGHT_BUFFER_BEFORE);
  const end = Math.min(currentDoc.pages, currentCenterPage + HIGHLIGHT_BUFFER_AFTER);

  // Copy the set first: entries are deleted while iterating.
  for (const p of Array.from(highlightedPages)) {
    if (p < start || p > end) {
      clearHighlightsOnPage(p);
      highlightedPages.delete(p);
    }
  }

  const highlightNow = (p) => {
    highlightPageMatches(p, { append: false });
    highlightedPages.add(p);
  };

  // A page that finishes loading late must still match and still be inside
  // the window as it stands at that moment.
  const stillWanted = (p) =>
    matchPageSet.has(p) &&
    currentCenterPage != null &&
    p >= currentCenterPage - HIGHLIGHT_BUFFER_BEFORE &&
    p <= currentCenterPage + HIGHLIGHT_BUFFER_AFTER;

  const pending = [];
  for (let p = start; p <= end; p++) {
    if (!matchPageSet.has(p) || highlightedPages.has(p)) continue;
    if (pageCache[p]) {
      highlightNow(p);
    } else {
      pending.push(safeEnsurePage(p).then(() => {
        if (stillWanted(p)) highlightNow(p);
      }));
    }
  }

  // Directional prefetch (load only, no highlight). Walk outward from the
  // window edge, nearest page first, so the range is covered for any
  // PREFETCH_EXTRA_AHEAD value in both scroll directions.
  if (scrollDirection !== 0) {
    const goingDown = scrollDirection > 0;
    const step = goingDown ? 1 : -1;
    const first = goingDown ? end + 1 : start - 1;
    const last = goingDown
      ? Math.min(currentDoc.pages, end + PREFETCH_EXTRA_AHEAD)
      : Math.max(1, start - PREFETCH_EXTRA_AHEAD);
    for (let p = first; goingDown ? p <= last : p >= last; p += step) {
      if (matchPageSet.has(p) && !pageCache[p]) {
        pending.push(safeEnsurePage(p));
      }
    }
  }

  Promise.all(pending).catch(e => console.warn('[buffer] window update error', e));
}
540
+
541
+ /* Duplicate prevention + dedupe logic */
542
/** Remove all but the last `.page` element rendered for `pageNum`. */
function dedupePageDom(pageNum) {
  const copies = Array.from(pagesDiv.querySelectorAll(`.page[data-page="${pageNum}"]`));
  // slice(0, -1) is empty for zero or one element, so nothing is removed then.
  copies.slice(0, -1).forEach(node => node.remove());
}
547
+
548
/* Remembers which document each in-flight page load was started for. */
const pageLoadOwners = new WeakMap();

/**
 * Fetch, render and cache one page of the current document.
 *
 * Resolves immediately when the page is already cached and shares the
 * in-flight promise when a load for the same page of the same document is
 * running. A load whose document was replaced while it was in flight is
 * abandoned instead of being inserted into the new document's view.
 *
 * @param {number} pageNum  1-based page number.
 * @returns {Promise<void>} Rejects when the request or JSON parsing fails,
 *   or when the server answers with a non-OK status.
 */
async function ensurePageLoaded(pageNum) {
  if (pageCache[pageNum]) return;

  const doc = currentDoc;
  const inflight = pageLoadPromises[pageNum];
  // Only piggyback on a load that belongs to the document currently open.
  if (inflight && pageLoadOwners.get(inflight) === doc) return inflight;

  const load = (async () => {
    if (!doc) return;
    const res = await fetch(`/api/doc/${doc.doc_id}/page/${pageNum}`);
    const data = await res.json();
    if (!res.ok) throw new Error(data.error || `Failed to load page ${pageNum}`);

    // Another PDF may have been opened while the request was in flight.
    if (currentDoc !== doc) return;

    const pageEl = document.createElement('div');
    pageEl.className = 'page';
    pageEl.dataset.page = pageNum;

    const img = document.createElement('img');
    img.src = data.image_url;
    img.alt = `Page ${pageNum}`;
    img.decoding = 'async';
    // NOTE(review): with loading="lazy" the browser may defer far-off-screen
    // images, which would delay the await on imageLoadedPromise below. Confirm.
    img.loading = 'lazy';
    pageEl.appendChild(img);

    const label = document.createElement('div');
    label.className = 'page-label';
    label.textContent = `Page ${pageNum}`;
    pageEl.appendChild(label);

    // Transparent layer that receives the highlight boxes.
    const overlay = document.createElement('div');
    overlay.className = 'overlay';
    overlay.style.position = 'absolute';
    overlay.style.inset = '0';
    overlay.style.pointerEvents = 'none';
    pageEl.appendChild(overlay);

    insertPageInOrder(pageEl);
    dedupePageDom(pageNum);

    // Resolves on load or on error, so a broken image never blocks callers.
    const imageLoadedPromise = new Promise(resolve => {
      img.onload = () => resolve();
      img.onerror = () => resolve();
    });
    pageCache[pageNum] = {
      tokens: data.tokens,
      text: data.text,
      imageLoadedPromise,
      overlay
    };
    await imageLoadedPromise;

    // The document may also have been replaced while the image was loading.
    if (currentDoc !== doc) return;

    ensurePageObserver();
    pageObserver.observe(pageEl);

    if (bufferedHighlightMode && matchPageSet.has(pageNum)) {
      const inWindow =
        currentCenterPage != null &&
        pageNum >= currentCenterPage - HIGHLIGHT_BUFFER_BEFORE &&
        pageNum <= currentCenterPage + HIGHLIGHT_BUFFER_AFTER;
      if (inWindow) {
        highlightPageMatches(pageNum, { append: false });
        highlightedPages.add(pageNum);
      }
    } else if (seamlessHighlightActive && !bufferedHighlightMode && matchPageSet.has(pageNum)) {
      highlightPageMatches(pageNum, { append: true });
    }
  })();

  pageLoadOwners.set(load, doc);
  pageLoadPromises[pageNum] = load;

  try {
    await load;
  } finally {
    // Do not remove a newer load registered for the same page number.
    if (pageLoadPromises[pageNum] === load) delete pageLoadPromises[pageNum];
  }
}
618
+
619
/**
 * Insert `pageEl` into the pages container so that `.page` elements stay
 * sorted by their `data-page` number; append when no later page exists.
 */
function insertPageInOrder(pageEl) {
  const num = parseInt(pageEl.dataset.page, 10);
  const successor = Array.from(pagesDiv.querySelectorAll('.page'))
    .find(el => num < parseInt(el.dataset.page, 10));

  if (successor) {
    pagesDiv.insertBefore(pageEl, successor);
  } else {
    pagesDiv.appendChild(pageEl);
  }
}
635
+
636
+ /* Highlighting */
637
/** Remove every highlight box from the document. */
function clearAllHighlights() {
  for (const box of document.querySelectorAll('.hl-box')) {
    box.remove();
  }
}
640
/** Remove the highlight boxes of a single rendered page, if it exists. */
function clearHighlightsOnPage(pageNum) {
  const pageEl = document.querySelector(`.page[data-page="${pageNum}"]`);
  if (pageEl) {
    for (const box of pageEl.querySelectorAll('.hl-box')) {
      box.remove();
    }
  }
}
645
/**
 * Draw a highlight box over every cached token of `pageNum` whose lowercased
 * text equals one of the current search words.
 *
 * Box geometry is `tok.bbox` ([x0, y0, x1, y1]) multiplied by 100 and applied
 * as percentages of the overlay.
 *
 * @param {number} pageNum
 * @param {{append?: boolean}} [options]  With `append` false (default) the
 *   page's existing boxes are cleared first.
 */
function highlightPageMatches(pageNum, {append=false} = {}) {
  const cached = pageCache[pageNum];
  if (!cached || !currentWords.length) return;
  if (!append) clearHighlightsOnPage(pageNum);

  const wanted = new Set(currentWords);
  const asPercent = value => (value * 100) + '%';

  const boxes = document.createDocumentFragment();
  cached.tokens
    .filter(tok => wanted.has(tok.text.toLowerCase()))
    .forEach(tok => {
      const [x0, y0, x1, y1] = tok.bbox;
      const box = document.createElement('div');
      box.className = 'hl-box';
      box.style.left = asPercent(x0);
      box.style.top = asPercent(y0);
      box.style.width = asPercent(x1 - x0);
      box.style.height = asPercent(y1 - y0);
      boxes.appendChild(box);
    });

  // Single DOM insertion for all boxes of the page.
  cached.overlay.appendChild(boxes);
}
667
+
668
+ /* Resize (no-op) */
669
// Highlight boxes are percentage-positioned, so a resize needs no work here.
window.addEventListener('resize', function onResize() {});
670
+
671
+ /* Zoom */
672
/** Enable both zoom buttons. */
function enableZoom() {
  for (const btn of [zoomIn, zoomOut]) {
    btn.disabled = false;
  }
}
676
/** Disable both zoom buttons and restore the 100% unscaled view. */
function disableZoom() {
  for (const btn of [zoomIn, zoomOut]) {
    btn.disabled = true;
  }
  currentScale = 1.0;
  zoomVal.textContent = '100%';
  pagesDiv.style.transform = '';
}
683
// Each click moves the zoom by one SCALE_STEP; applyZoom clamps the result.
zoomIn.addEventListener('click', function onZoomIn() {
  applyZoom(currentScale + SCALE_STEP);
});
zoomOut.addEventListener('click', function onZoomOut() {
  applyZoom(currentScale - SCALE_STEP);
});
685
+
686
/**
 * Scale the pages container to `newScale`, clamped to
 * [MIN_SCALE, MAX_SCALE]. No-op without a document or when the clamped
 * value equals the current scale.
 */
function applyZoom(newScale) {
  if (!currentDoc) return;

  const clamped = Math.min(MAX_SCALE, Math.max(MIN_SCALE, newScale));
  const unchanged = Math.abs(clamped - currentScale) < 0.001;
  if (unchanged) return;

  currentScale = clamped;
  zoomVal.textContent = Math.round(currentScale * 100) + '%';
  pagesDiv.style.transformOrigin = 'top center';
  pagesDiv.style.transform = `scale(${currentScale})`;
}
695
+
696
+ /* Sidebar resize */
697
/* Drag the divider to resize the sidebar via the --sidebar-width variable. */
(function enableDivider() {
  const MIN_WIDTH = 220;
  let dragging = false;

  function beginDrag() {
    dragging = true;
    // Suppress text selection and show the resize cursor while dragging.
    document.body.style.userSelect = 'none';
    document.documentElement.style.cursor = 'col-resize';
  }

  function endDrag() {
    if (!dragging) return;
    dragging = false;
    document.body.style.userSelect = '';
    document.documentElement.style.cursor = '';
  }

  function onDrag(e) {
    if (!dragging) return;
    const maxWidth = Math.min(window.innerWidth * 0.6, 700);
    const width = Math.max(MIN_WIDTH, Math.min(maxWidth, e.clientX));
    document.documentElement.style.setProperty('--sidebar-width', width + 'px');
  }

  divider.addEventListener('mousedown', beginDrag);
  window.addEventListener('mouseup', endDrag);
  window.addEventListener('mousemove', onDrag);
})();
719
+
720
+ /* Reset */
721
/**
 * Return the viewer to its initial state before a new PDF is processed:
 * clears document/search state, cached pages, highlight bookkeeping,
 * scroll tracking and every dependent piece of UI.
 */
function resetAll() {
  // Document and search state.
  currentDoc = null;
  currentWords = [];
  searchResults = [];
  currentSelectedPage = null;
  pageCache = {};
  matchPageSet.clear();

  // Highlight and scroll tracking; stale values would otherwise carry over
  // into the next document (e.g. directional prefetch, suppressed observer).
  seamlessHighlightActive = false;
  highlightedPages.clear();
  currentCenterPage = null;
  scrollDirection = 0;
  programmaticScrollInProgress = false;

  // UI.
  fileInfo.textContent = '';
  resultsList.innerHTML = '';
  pageText.value = '';
  legend.innerHTML = '<span class="dim">No words</span>';
  pagesDiv.innerHTML = '';
  disableZoom(); // also clears the pages transform
  loadAllBtn.disabled = true;
  downloadOcrLink.style.display = 'none';
  ocrStatusNote.style.display = 'none';
  setStatus("Ready.");

  if (pageObserver) {
    pageObserver.disconnect();
    pageObserver = null;
  }

  // In-flight fetches cannot be cancelled, but forgetting their guards stops
  // a later load of the same page number from being handed a stale promise.
  for (const key of Object.keys(pageLoadPromises)) {
    delete pageLoadPromises[key];
  }
}
750
+
751
// Initial status line shown once the script has finished loading.
setStatus("Ready.");
static/app3.js ADDED
@@ -0,0 +1,831 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Front-end logic: buffered highlights + background full-page loading + robust far-page jumps */
2
{
  // APP_CONFIG is optional; logs `undefined` when it is absent.
  const version = window.APP_CONFIG?.buildVersion;
  console.log("[app] build version:", version);
}
3
+
4
+ /* ------------- DOM Handles ------------- */
5
// Upload controls and OCR options.
const pdfInput        = document.getElementById('pdfInput');
const fileInfo        = document.getElementById('fileInfo');
const ocrToggle       = document.getElementById('ocrToggle');
const ocrLang         = document.getElementById('ocrLang');
const downloadOcrLink = document.getElementById('downloadOcrLink');
const ocrStatusNote   = document.getElementById('ocrStatusNote');

// Search controls and result panes.
const wordsInput  = document.getElementById('wordsInput');
const searchBtn   = document.getElementById('searchBtn');
const resultsList = document.getElementById('resultsList');
const pageText    = document.getElementById('pageText');
const legend      = document.getElementById('legend');

// Page viewer, status line, zoom and layout controls.
const pagesDiv   = document.getElementById('pages');
const statusMsg  = document.getElementById('statusMsg');
const zoomIn     = document.getElementById('zoomIn');
const zoomOut    = document.getElementById('zoomOut');
const zoomVal    = document.getElementById('zoomVal');
const divider    = document.getElementById('divider');
const loadAllBtn = document.getElementById('loadAllBtn');

// Processing overlay.
const processingOverlay = document.getElementById('processingOverlay');
const processingTitle   = document.getElementById('processingTitle');
const processingDetail  = document.getElementById('processingDetail');
const processingHint    = document.getElementById('processingHint');
const processingError   = document.getElementById('processingError');
const overlayCloseBtn   = document.getElementById('overlayCloseBtn');
const processingSpinner = document.getElementById('processingSpinner');
32
+
33
+ /* ------------- Global State ------------- */
34
// OCR defaults applied on every script load.
ocrToggle.checked = false;
ocrLang.value = 'eng';

// Current document and search.
let currentDoc = null;           // upload response for the open PDF
let currentWords = [];           // lowercased search words
let searchResults = [];          // result entries with `page` and `counts`
let currentSelectedPage = null;

// Page cache and match bookkeeping.
let pageCache = {};              // pageNum -> { tokens, text, imageLoadedPromise, overlay }
const pageLoadPromises = {};     // pageNum -> in-flight load promise
let matchPageSet = new Set();    // page numbers that have at least one match
let seamlessHighlightActive = false;

/* Highlight buffering: only pages within this window around the center page
   carry highlight boxes. */
const HIGHLIGHT_BUFFER_BEFORE = 2;
const HIGHLIGHT_BUFFER_AFTER = 2;
const PREFETCH_EXTRA_AHEAD = 1;  // extra pages loaded past the window edge
let bufferedHighlightMode = true;

/* Jump & center logic */
let currentCenterPage = null;
let highlightedPages = new Set();
let scrollDirection = 0;         // -1 up, +1 down, 0 unknown
let programmaticScrollInProgress = false;
let pageObserver = null;
let jumpGeneration = 0;          // bumped on each jump request

/* Page loading strategy: documents above the threshold render only the first
   AUTO_LOAD_PAGES_LARGE pages up front. */
const LARGE_DOC_THRESHOLD = 80;
const AUTO_LOAD_PAGES_LARGE = 10;
const AUTO_LOAD_PAGES_SMALL = Infinity;

/* Background full load configuration */
const ENABLE_BACKGROUND_FULL_LOAD = true;
const BG_LOAD_CONCURRENCY = 6;
const BG_LOAD_STATUS_INTERVAL_MS = 1200;
let bgLoadActive = false;
let bgLoadedCount = 0;
let bgTotalToLoad = 0;
let bgLoadAbort = false;

/* Zoom */
let currentScale = 1.0;
const MIN_SCALE = 0.5;
const MAX_SCALE = 2.5;
const SCALE_STEP = 0.15;

/* Overlay timers */
let overlayCompletedTimestamp = null;
let overlayForceHideTimer = null;
84
+
85
+ /* ------------- Utility ------------- */
86
/** Show `msg` in the status line. */
function setStatus(msg) {
  statusMsg.textContent = msg;
}
87
/**
 * Split the raw search box text into unique lowercased words.
 * Words are separated by commas, semicolons or whitespace; the first
 * occurrence of each word determines its position in the result.
 *
 * @param {string} raw
 * @returns {string[]}
 */
function parseWords(raw) {
  const lowered = raw
    .trim()
    .split(/[,\s;]+/)
    .filter(Boolean)
    .map(word => word.toLowerCase());
  // A Set keeps first-insertion order, which matches first-occurrence dedupe.
  return [...new Set(lowered)];
}
94
+
95
+ /* ------------- Overlay Helpers ------------- */
96
/**
 * Show the blocking overlay in its "working" state (spinner visible, error
 * and close button hidden) and cancel any pending force-hide timer.
 *
 * @param {string} title
 * @param {string} [detail]
 * @param {boolean} [showHint=true]  Whether the hint line is displayed.
 */
function showProcessingOverlay(title, detail, showHint=true) {
  processingTitle.textContent = title;
  processingDetail.textContent = detail || '';

  processingHint.style.display = showHint ? 'block' : 'none';
  processingError.style.display = 'none';
  overlayCloseBtn.style.display = 'none';
  processingSpinner.style.display = 'block';

  processingOverlay.classList.remove('hidden');
  overlayCompletedTimestamp = null;

  if (!overlayForceHideTimer) return;
  clearTimeout(overlayForceHideTimer);
  overlayForceHideTimer = null;
}
110
+
111
/**
 * Switch the overlay to its "Completed" state. If the overlay is still
 * visible 2.5 s later, the close button is revealed.
 */
function markOverlayCompleted(successMsg) {
  processingTitle.textContent = 'Completed';
  processingDetail.textContent = successMsg || 'Done.';
  processingHint.style.display = 'none';
  processingSpinner.style.display = 'none';
  overlayCompletedTimestamp = performance.now();

  const revealCloseButton = () => {
    const stillVisible = !processingOverlay.classList.contains('hidden');
    if (stillVisible) overlayCloseBtn.style.display = 'inline-flex';
  };
  setTimeout(revealCloseButton, 2500);
}
123
+
124
/** Switch the overlay to its error state and offer the close button. */
function showOverlayError(msg) {
  processingTitle.textContent = 'Error';
  processingError.textContent = msg;

  processingError.style.display = 'block';
  processingSpinner.style.display = 'none';
  processingHint.style.display = 'none';
  overlayCloseBtn.style.display = 'inline-flex';
}
132
+
133
/** Hide the overlay and cancel the pending force-hide timer, if any. */
function hideProcessingOverlay() {
  processingOverlay.classList.add('hidden');
  if (!overlayForceHideTimer) return;
  clearTimeout(overlayForceHideTimer);
  overlayForceHideTimer = null;
}
140
+
141
// The close button dismisses the overlay.
overlayCloseBtn.addEventListener('click', hideProcessingOverlay);
142
+
143
+ /* ------------- Upload Flow ------------- */
144
/**
 * Upload flow: reset the viewer, send the PDF (with OCR options) to the
 * server, reflect OCR status in the UI, render the preview pages and then
 * start loading the remaining pages in the background.
 *
 * On an upload or render failure the overlay switches to its error state
 * and stays visible until the user closes it.
 */
pdfInput.addEventListener('change', async (e) => {
  const f = e.target.files[0];
  if (!f) return;
  resetAll();

  const wantsOCR = ocrToggle.checked;
  showProcessingOverlay(
    wantsOCR ? 'Performing OCR...' : 'Processing PDF...',
    wantsOCR
      ? 'Running OCR (deskew + text extraction). Please wait...'
      : 'Indexing document text. Please wait...',
    wantsOCR
  );

  setStatus("Uploading...");
  const fd = new FormData();
  fd.append("pdf", f);
  fd.append("ocr", String(wantsOCR));
  fd.append("lang", ocrLang.value.trim() || 'eng');

  let json;
  try {
    const res = await fetch("/api/upload", { method: "POST", body: fd });
    json = await res.json();
    if (!res.ok) throw new Error(json.error || "Upload / processing failed");
  } catch (err) {
    console.error("[upload] error:", err, json);
    setStatus(err.message || "Upload error");
    showOverlayError((json && json.error) ? json.error : err.message);
    return;
  }

  currentDoc = json;
  fileInfo.textContent = `${json.filename} (${json.pages} pages)`;
  enableZoom();
  enableLoadAllIfNeeded();

  // OCR status
  if (json.ocr_performed) {
    ocrStatusNote.style.display = 'block';
    ocrStatusNote.textContent = json.ocr_failed
      ? `OCR failed: ${json.ocr_message || 'Unknown error.'}`
      : (json.ocr_message || 'OCR completed.');
  } else {
    ocrStatusNote.style.display = 'none';
    ocrStatusNote.textContent = '';
  }

  // OCR download link (only when a usable OCR PDF was produced)
  if (json.ocr_performed && !json.ocr_failed && json.used_ocr_pdf) {
    try {
      const metaRes = await fetch(`/api/doc/${json.doc_id}/meta`);
      const metaJ = await metaRes.json();
      if (metaRes.ok && metaJ.download_ocr_url) {
        downloadOcrLink.href = metaJ.download_ocr_url;
        downloadOcrLink.style.display = 'inline-flex';
      }
    } catch (e) {
      // Best effort: the viewer works without the download link.
      console.warn("[meta] fetch failed:", e);
    }
  } else {
    downloadOcrLink.style.display = 'none';
  }

  markOverlayCompleted(
    (json.ocr_performed && !json.ocr_failed)
      ? `OCR finished in ${(json.ocr_time_seconds || 0).toFixed(1)}s. Rendering preview...`
      : (json.ocr_performed && json.ocr_failed)
        ? `Rendering original pages (OCR failed).`
        : `Rendering preview...`
  );

  try {
    setStatus("Rendering preview pages...");
    await autoRenderInitialPages();
    setStatus("Preview ready. Enter words & press Search.");
  } catch (renderErr) {
    console.error("[render] error:", renderErr);
    setStatus("Render error: " + renderErr.message);
    showOverlayError("Render error: " + renderErr.message);
    // Leave the error overlay up; auto-hiding it would hide the message.
    return;
  }

  // Success path only: dismiss the overlay shortly after rendering, with a
  // longer safety timer in case the first hide does not happen.
  setTimeout(hideProcessingOverlay, 400);
  overlayForceHideTimer = setTimeout(() => {
    if (!processingOverlay.classList.contains('hidden')) {
      console.warn("[overlay] force hiding after timeout");
      hideProcessingOverlay();
    }
  }, 15000);

  // Start background load of remaining pages (so every page is accessible)
  if (ENABLE_BACKGROUND_FULL_LOAD) {
    startBackgroundFullLoad();
  }
});
240
+
241
+ /* ------------- Background Full Page Loading ------------- */
242
/**
 * Load every page that is not cached yet, using BG_LOAD_CONCURRENCY
 * workers that pull from a shared queue. Progress is written to the status
 * line on a timer. Per-page failures are logged and counted as processed.
 * No-op without a document, while a background load is already running, or
 * when the cache already holds as many entries as the document has pages.
 */
async function startBackgroundFullLoad() {
  if (!currentDoc || bgLoadActive) return;

  const total = currentDoc.pages;
  if (Object.keys(pageCache).length >= total) return;

  bgLoadActive = true;
  bgLoadAbort = false;

  const queue = [];
  for (let p = 1; p <= total; p++) {
    if (!pageCache[p]) queue.push(p);
  }
  bgTotalToLoad = queue.length;
  bgLoadedCount = 0;

  const reportProgress = () => {
    if (!bgLoadActive) return;
    const pct = ((bgLoadedCount / Math.max(1,bgTotalToLoad)) * 100).toFixed(1);
    setStatus(`Background loading pages (${bgLoadedCount}/${bgTotalToLoad}) ${pct}%`);
  };
  const statusTimer = setInterval(reportProgress, BG_LOAD_STATUS_INTERVAL_MS);

  // Shared cursor: each worker claims the next queue slot synchronously.
  let cursor = 0;
  const runWorker = async () => {
    while (!bgLoadAbort && cursor < queue.length) {
      const pageNum = queue[cursor++];
      try {
        await safeEnsurePage(pageNum);
      } catch (e) {
        console.warn("[bgload] error page", pageNum, e);
      } finally {
        bgLoadedCount++;
      }
    }
  };

  const workers = [];
  for (let w = 0; w < BG_LOAD_CONCURRENCY; w++) {
    workers.push(runWorker());
  }
  await Promise.all(workers);

  clearInterval(statusTimer);
  if (!bgLoadAbort) {
    setStatus("All pages loaded in background. Ready for fast navigation.");
  }
  bgLoadActive = false;
}
289
+
290
+ /* ------------- Load All (Manual) ------------- */
291
/**
 * "Load all" button: wait for a running background load, otherwise load
 * every page sequentially. A page that fails to load no longer aborts the
 * run; failures are counted and the button is re-enabled so the user can
 * retry. The run stops quietly if the document is replaced meanwhile.
 */
loadAllBtn.addEventListener('click', async () => {
  const doc = currentDoc;
  if (!doc) return;

  // If the background loader is active, just wait for it to finish.
  if (bgLoadActive) {
    setStatus("Completing background load...");
    bgLoadAbort = false; // ensure not aborted
    while (bgLoadActive) {
      await new Promise(r=>setTimeout(r,200));
    }
    setStatus("All pages loaded.");
    return;
  }

  loadAllBtn.disabled = true;
  setStatus("Loading all pages...");

  let failed = 0;
  for (let p = 1; p <= doc.pages; p++) {
    try {
      await safeEnsurePage(p);
    } catch (e) {
      // safeEnsurePage has already logged and reported the error.
      failed++;
    }
    // A new upload resets the viewer; the remaining pages belong to the old doc.
    if (currentDoc !== doc) return;
    if (p % 10 === 0) setStatus(`Loading all pages ${p}/${doc.pages}...`);
  }

  if (failed > 0) {
    setStatus(`Loaded with errors: ${failed} page(s) failed. Click again to retry.`);
    loadAllBtn.disabled = false;
  } else {
    setStatus("All pages loaded.");
  }
});
312
+
313
/**
 * Enable the "load all" button only for documents with more pages than
 * LARGE_DOC_THRESHOLD; disable it otherwise or when no document is open.
 */
function enableLoadAllIfNeeded() {
  const isLargeDoc = Boolean(currentDoc) && currentDoc.pages > LARGE_DOC_THRESHOLD;
  loadAllBtn.disabled = !isLargeDoc;
}
320
+
321
+ /* ------------- Preview Pages ------------- */
322
/**
 * Render the initial pages sequentially: the first AUTO_LOAD_PAGES_LARGE
 * pages for documents above LARGE_DOC_THRESHOLD, every page otherwise.
 * Rejects if any page fails to load.
 */
async function autoRenderInitialPages() {
  if (!currentDoc) return;

  const total = currentDoc.pages;
  const cap = total > LARGE_DOC_THRESHOLD ? AUTO_LOAD_PAGES_LARGE : AUTO_LOAD_PAGES_SMALL;
  const toLoad = Math.min(cap, total);
  const isPreview = toLoad < total;

  let p = 1;
  while (p <= toLoad) {
    await safeEnsurePage(p);
    // Report every third page and the final one.
    if (p % 3 === 0 || p === toLoad) {
      setStatus(`Rendering pages ${p}/${toLoad}${isPreview ? ' (preview)' : ''}...`);
    }
    p++;
  }

  if (isPreview) {
    setStatus(`Preview loaded (${toLoad}/${total}). Searching will still find all pages.`);
  }
}
337
+
338
+ /* ------------- Search ------------- */
339
// Run the search from the button or by pressing Enter in the words field.
searchBtn.addEventListener('click', runSearch);
wordsInput.addEventListener('keydown', function onWordsKeydown(e) {
  if (e.key === 'Enter') {
    runSearch();
  }
});
341
+
342
/**
 * Run a search for the words in the input box: reset previous highlight
 * state, query the server, list the matching pages and center the highlight
 * window on the first match.
 */
async function runSearch() {
  if (!currentDoc) {
    setStatus("Upload a PDF first.");
    return;
  }

  const raw = wordsInput.value;
  const words = parseWords(raw);
  currentWords = words;
  updateLegend(words);

  // Drop the highlight state of the previous search.
  clearAllHighlights();
  seamlessHighlightActive = false;
  matchPageSet.clear();
  highlightedPages.clear();
  currentCenterPage = null;

  if (!words.length) {
    resultsList.innerHTML = '';
    pageText.value = '';
    setStatus("No words entered.");
    return;
  }

  setStatus("Searching...");
  let data;
  try {
    const endpoint = `/api/doc/${currentDoc.doc_id}/search`;
    const request = {
      method: "POST",
      headers: {"Content-Type":"application/json"},
      // The raw text is sent, not the parsed word list.
      body: JSON.stringify({words: raw})
    };
    const res = await fetch(endpoint, request);
    data = await res.json();
    if (!res.ok) throw new Error(data.error || "Search failed");
  } catch (err) {
    console.error("[search] error:", err, data);
    setStatus(err.message);
    return;
  }

  searchResults = data.results || [];
  populateResults();
  if (!searchResults.length) {
    setStatus("No pages found.");
    pageText.value = '';
    return;
  }
  matchPageSet = new Set(searchResults.map(entry => entry.page));

  const firstPage = searchResults[0].page;
  await safeEnsurePage(firstPage);
  // Make sure the buffer pages around the first match are ready.
  await preloadHighlightWindow(firstPage);
  setCenterPage(firstPage, { fromClick:true });
  seamlessHighlightActive = true;
  selectResultIndex(0, {preserveHighlights:true, skipScroll:true});
  scrollPageIntoView(firstPage);
  setStatus(`Ready. Highlight window centered at page ${firstPage}.`);
}
398
+
399
/**
 * Rebuild the legend: a colour swatch followed by the comma-separated
 * search words, or a dimmed "No words" placeholder for an empty list.
 */
function updateLegend(words) {
  if (!words.length) {
    legend.innerHTML = '<span class="dim">No words</span>';
    return;
  }

  legend.innerHTML = '';

  const swatch = document.createElement('div');
  swatch.className = 'swatch';

  const label = document.createElement('div');
  label.textContent = words.join(', ');

  legend.appendChild(swatch);
  legend.appendChild(label);
}
412
+
413
/**
 * Rebuild the sidebar result list from `searchResults`.
 *
 * Each entry shows the page number and the non-zero per-word counts and
 * jumps to its page when clicked. Labels are written through `textContent`
 * so user-typed words are never interpreted as HTML.
 */
function populateResults() {
  resultsList.innerHTML = '';
  if (!searchResults.length) {
    const li = document.createElement('li');
    li.textContent = '[No pages]';
    li.classList.add('dim');
    resultsList.appendChild(li);
    return;
  }

  searchResults.forEach((r, idx) => {
    const counts = r.counts || {};
    const parts = [];
    currentWords.forEach(w => {
      const c = counts[w] || 0;
      if (c) parts.push(`${w}:${c}`);
    });

    const pageLabel = document.createElement('span');
    pageLabel.textContent = `Pg ${r.page}`;

    const countsLabel = document.createElement('span');
    countsLabel.style.opacity = '.75';
    countsLabel.textContent = parts.join(', ');

    const li = document.createElement('li');
    li.append(pageLabel, countsLabel);
    li.addEventListener('click', () => jumpToResultPage(idx, r.page));
    resultsList.appendChild(li);
  });
}
435
+
436
+ /* Far Page Jump Logic */
437
/**
 * Jump to a (possibly far away) result page: load it, center the highlight
 * window on it, select it in the list and scroll it into view, then wait up
 * to 3 s for the surrounding buffer pages.
 *
 * Each call bumps `jumpGeneration`; a call that has been superseded by a
 * newer jump stops touching the status line and the scroll flag. A failed
 * page load releases the programmatic-scroll flag instead of leaving the
 * intersection observer suppressed.
 *
 * @param {number} resultIndex  Index of the result in `searchResults`.
 * @param {number} pageNum      Page number of that result.
 */
async function jumpToResultPage(resultIndex, pageNum) {
  if (!currentDoc) return;
  jumpGeneration++;
  const myGen = jumpGeneration;
  const superseded = () => myGen !== jumpGeneration;

  setStatus(`Jumping to page ${pageNum}...`);
  programmaticScrollInProgress = true;

  // Load target page immediately
  try {
    await safeEnsurePage(pageNum);
  } catch (e) {
    // safeEnsurePage has already logged and reported the failure.
    if (!superseded()) programmaticScrollInProgress = false;
    return;
  }
  if (superseded()) return;

  // Preload the buffer pages in parallel (non-blocking for the scroll).
  let preloadFailed = false;
  const preloadPromise = preloadHighlightWindow(pageNum).catch(e => {
    preloadFailed = true;
    console.warn('[jump] buffer preload error', e);
  });

  // Set center first so highlight window logic knows where to highlight
  setCenterPage(pageNum, { fromClick:true });

  // Select in list (without causing new scroll)
  selectResultIndex(resultIndex, { preserveHighlights:true, skipScroll:true });

  // Scroll now (the page element is already in the DOM)
  scrollPageIntoView(pageNum);

  // Wait a bit for buffer loading but don't block forever
  let timedOut = false;
  let timeoutId;
  const timeout = new Promise(resolve => {
    timeoutId = setTimeout(()=>{ timedOut = true; resolve(); }, 3000);
  });
  await Promise.race([preloadPromise, timeout]);
  clearTimeout(timeoutId);

  // A newer jump owns the status line and the scroll flag from here on.
  if (superseded()) return;

  if (timedOut) {
    setStatus(`Page ${pageNum} ready (buffer still loading)`);
  } else if (!preloadFailed) {
    setStatus(`Centered on page ${pageNum}.`);
  }
  // On a preload failure the error status set by safeEnsurePage is kept.

  // Safety: unset programmatic scroll mode slightly later so observer can resume
  setTimeout(()=>{
    if (!superseded()) programmaticScrollInProgress = false;
  }, 600);
}
474
+
475
/**
 * Load, in parallel, every uncached page inside the highlight buffer window
 * around `centerPage`. Rejects if any of those loads fails.
 */
async function preloadHighlightWindow(centerPage) {
  const firstPage = Math.max(1, centerPage - HIGHLIGHT_BUFFER_BEFORE);
  const lastPage = Math.min(currentDoc.pages, centerPage + HIGHLIGHT_BUFFER_AFTER);

  const pending = [];
  let p = firstPage;
  while (p <= lastPage) {
    if (!pageCache[p]) pending.push(safeEnsurePage(p));
    p++;
  }

  if (pending.length) {
    await Promise.all(pending);
  }
}
484
+
485
+ /* Select result entry */
486
/**
 * Mark result `idx` as the active list entry, ensure its page is loaded and
 * show its text. Highlights are touched only outside buffered mode.
 *
 * @param {number} idx  Index into `searchResults`; out-of-range is ignored.
 * @param {{preserveHighlights?: boolean, skipScroll?: boolean}} [opts]
 */
async function selectResultIndex(idx, opts = {}) {
  if (idx < 0 || idx >= searchResults.length) return;

  const items = [...resultsList.children];
  for (let i = 0; i < items.length; i++) {
    items[i].classList.toggle('active', i === idx);
  }

  const entry = searchResults[idx];
  currentSelectedPage = entry.page;
  await safeEnsurePage(entry.page);
  showPageText(entry.page);

  if (!bufferedHighlightMode) {
    if (seamlessHighlightActive) {
      highlightPageMatches(entry.page, {append:true});
    } else if (!opts.preserveHighlights) {
      clearAllHighlights();
      highlightPageMatches(entry.page);
    }
  }

  if (!opts.skipScroll) {
    scrollPageIntoView(entry.page);
  }
}
505
+
506
/**
 * Show the cached text of `pageNum` in the text pane. When the page is a
 * search result, a "Matches: word=count, ..." header and a 40-dash rule are
 * placed before the text. Does nothing for pages that are not cached.
 */
function showPageText(pageNum) {
  const cached = pageCache[pageNum];
  if (!cached) return;

  const result = searchResults.find(candidate => candidate.page === pageNum);
  const matchLabels = result
    ? currentWords
        .map(word => `${word}=${result.counts[word] || 0}`)
        .filter(label => !label.endsWith('=0'))
    : [];

  const header = matchLabels.length
    ? 'Matches: ' + matchLabels.join(', ') + '\n' + '-'.repeat(40) + '\n'
    : '';

  pageText.value = header + cached.text;
}
519
+
520
/** Smooth-scroll the element rendered for `pageNum` to the top, if present. */
function scrollPageIntoView(pageNum) {
  const pageEl = document.querySelector(`.page[data-page="${pageNum}"]`);
  if (!pageEl) return;
  pageEl.scrollIntoView({behavior:'smooth', block:'start'});
}
524
+
525
+ /* ------------- Intersection Observer (Center Detection) ------------- */
526
/**
 * Create the shared IntersectionObserver on first use. It is rooted at
 * #pagesWrap and reports at 25%, 50% and 75% visibility.
 */
function ensurePageObserver() {
  if (pageObserver !== null && pageObserver !== undefined) {
    if (pageObserver) return;
  }
  const root = document.getElementById('pagesWrap');
  pageObserver = new IntersectionObserver(handlePageIntersections, {
    root,
    rootMargin: '0px',
    threshold: [0.25, 0.5, 0.75]
  });
}
534
+
535
/**
 * IntersectionObserver callback. Picks the intersecting entry with the
 * highest intersection ratio in this batch, records the scroll direction
 * relative to the previous center page and re-centers on it. Skipped outside
 * buffered mode and during programmatic scrolls.
 */
function handlePageIntersections(entries) {
  if (!bufferedHighlightMode || !entries.length) return;
  if (programmaticScrollInProgress) return;

  let winner = null;
  entries.forEach(entry => {
    if (!entry.isIntersecting) return;
    // Strict comparison: the earliest entry wins a tie.
    if (winner === null || entry.intersectionRatio > winner.intersectionRatio) {
      winner = entry;
    }
  });
  if (winner === null) return;

  const pageNum = parseInt(winner.target.dataset.page, 10);
  if (currentCenterPage === pageNum) return;

  if (currentCenterPage != null) {
    scrollDirection = pageNum > currentCenterPage ? 1 : -1;
  }
  setCenterPage(pageNum);
}
555
+
556
/**
 * Make `pageNum` the center of the highlight window and refresh it.
 * When triggered by a click (`fromClick`), observer-driven re-centering is
 * suppressed for 800 ms.
 */
function setCenterPage(pageNum, { fromClick=false } = {}) {
  currentCenterPage = pageNum;
  updateHighlightWindow();

  if (!fromClick) return;

  const releaseScrollLock = () => {
    programmaticScrollInProgress = false;
  };
  programmaticScrollInProgress = true;
  setTimeout(releaseScrollLock, 800);
}
564
+
565
+ /* ------------- Highlight Window Update ------------- */
566
/**
 * Keep highlights limited to the buffer window around `currentCenterPage`:
 * clear pages that left the window, highlight matching pages inside it
 * (loading them on demand) and prefetch matching pages just beyond the
 * window edge in the current scroll direction.
 *
 * Page loads are fire-and-forget; failures are logged as warnings.
 */
function updateHighlightWindow() {
  if (!currentDoc || !bufferedHighlightMode) return;
  if (currentCenterPage == null) return;

  const start = Math.max(1, currentCenterPage - HIGHLIGHT_BUFFER_BEFORE);
  const end = Math.min(currentDoc.pages, currentCenterPage + HIGHLIGHT_BUFFER_AFTER);

  // Remove outside window (iterate a copy: the set is mutated in the loop)
  for (const p of Array.from(highlightedPages)) {
    if (p < start || p > end) {
      clearHighlightsOnPage(p);
      highlightedPages.delete(p);
    }
  }

  const applyHighlight = (p) => {
    highlightPageMatches(p, { append:false });
    highlightedPages.add(p);
  };

  // By the time a page finishes loading, the search or the window may have
  // changed; highlight only if the page still matches and is still inside.
  const stillWanted = (p) =>
    matchPageSet.has(p) &&
    currentCenterPage != null &&
    p >= currentCenterPage - HIGHLIGHT_BUFFER_BEFORE &&
    p <= currentCenterPage + HIGHLIGHT_BUFFER_AFTER;

  const promises = [];
  for (let p = start; p <= end; p++) {
    if (!matchPageSet.has(p) || highlightedPages.has(p)) continue;
    if (pageCache[p]) {
      applyHighlight(p);
    } else {
      promises.push(safeEnsurePage(p).then(() => {
        if (stillWanted(p)) applyHighlight(p);
      }));
    }
  }

  // Directional prefetch (no highlight yet). Walk outward from the window
  // edge, nearest page first, so the full range is covered for any
  // PREFETCH_EXTRA_AHEAD value in both directions.
  if (scrollDirection !== 0) {
    const forward = scrollDirection > 0;
    const step = forward ? 1 : -1;
    const first = forward ? end + 1 : start - 1;
    const last = forward
      ? Math.min(currentDoc.pages, end + PREFETCH_EXTRA_AHEAD)
      : Math.max(1, start - PREFETCH_EXTRA_AHEAD);
    for (let p = first; forward ? p <= last : p >= last; p += step) {
      if (matchPageSet.has(p) && !pageCache[p]) {
        promises.push(safeEnsurePage(p));
      }
    }
  }

  Promise.all(promises).catch(e=>console.warn('[buffer] window update error', e));
}
613
+
614
+ /* ------------- Page Loading (with duplicate guard) ------------- */
615
/**
 * Load a page via ensurePageLoaded, reporting any failure to the console
 * and the status bar before re-throwing it to the caller.
 *
 * @param {number} pageNum Page number to load.
 * @returns {Promise<void>} Rejects with the original load error.
 */
async function safeEnsurePage(pageNum) {
  const reportAndRethrow = err => {
    console.error(`[page ${pageNum}] load error:`, err);
    setStatus(`Page ${pageNum} load error: ${err.message}`);
    throw err;
  };
  await ensurePageLoaded(pageNum).catch(reportAndRethrow);
}
624
+
625
/**
 * Remove duplicate DOM nodes for one page number, keeping only the last
 * matching `.page` element inside pagesDiv.
 *
 * @param {number} pageNum Page number whose duplicates are removed.
 */
function dedupePageDom(pageNum) {
  const copies = Array.from(
    pagesDiv.querySelectorAll(`.page[data-page="${pageNum}"]`)
  );
  // Everything except the final element is considered stale.
  copies.slice(0, -1).forEach(stale => stale.remove());
}
631
+
632
/**
 * Fetch one page of the current document, build its DOM (image, label,
 * highlight overlay), cache its tokens/text and start observing it.
 *
 * Concurrent calls for the same page share one in-flight promise. A page
 * that is already in pageCache resolves immediately.
 *
 * If the document is reset or replaced while the request or the image is
 * still loading, the stale result is discarded instead of being inserted
 * into the new document's view.
 *
 * @param {number} pageNum Page number to load.
 * @returns {Promise<void>} Rejects with an Error if the request fails or
 *   the response is not usable.
 */
async function ensurePageLoaded(pageNum) {
  if (pageCache[pageNum]) return;
  if (pageLoadPromises[pageNum]) return pageLoadPromises[pageNum];

  // Remember which document this load belongs to so staleness can be detected.
  const doc = currentDoc;

  const loadPromise = (async () => {
    if (!doc) return;
    const res = await fetch(`/api/doc/${doc.doc_id}/page/${pageNum}`);

    // An error response may not carry JSON; don't let the parse failure
    // mask the real problem.
    let data = null;
    try {
      data = await res.json();
    } catch (parseErr) {
      data = null;
    }
    if (!res.ok || !data) {
      throw new Error((data && data.error) || `Failed to load page ${pageNum}`);
    }

    // Document was reset/replaced while fetching: drop the stale page.
    if (currentDoc !== doc) return;

    const pageEl = document.createElement('div');
    pageEl.className = 'page';
    pageEl.dataset.page = pageNum;

    const img = document.createElement('img');
    // Attach handlers before setting src so no load/error event is missed.
    // A failed image still resolves: the page stays usable without it.
    const imageLoadedPromise = new Promise(resolve => {
      img.onload = () => resolve();
      img.onerror = () => resolve();
    });
    img.alt = `Page ${pageNum}`;
    img.decoding = 'async';
    // NOTE(review): with loading="lazy" the browser may defer the fetch for
    // off-screen pages, which would delay the await below until the page
    // nears the viewport. Confirm this is intended for background loads.
    img.loading = 'lazy';
    img.src = data.image_url;
    pageEl.appendChild(img);

    const label = document.createElement('div');
    label.className = 'page-label';
    label.textContent = `Page ${pageNum}`;
    pageEl.appendChild(label);

    const overlay = document.createElement('div');
    overlay.className = 'overlay';
    overlay.style.position = 'absolute';
    overlay.style.inset = '0';
    overlay.style.pointerEvents = 'none';
    pageEl.appendChild(overlay);

    insertPageInOrder(pageEl);
    dedupePageDom(pageNum);

    const entry = {
      tokens: Array.isArray(data.tokens) ? data.tokens : [],
      text: data.text,
      imageLoadedPromise,
      overlay
    };
    pageCache[pageNum] = entry;

    await imageLoadedPromise;

    // A reset during image load replaces pageCache and empties the DOM.
    if (currentDoc !== doc || pageCache[pageNum] !== entry) return;

    ensurePageObserver();
    pageObserver.observe(pageEl);

    if (bufferedHighlightMode && matchPageSet.has(pageNum)) {
      const inWindow =
        currentCenterPage != null &&
        pageNum >= currentCenterPage - HIGHLIGHT_BUFFER_BEFORE &&
        pageNum <= currentCenterPage + HIGHLIGHT_BUFFER_AFTER;
      if (inWindow) {
        highlightPageMatches(pageNum, { append: false });
        highlightedPages.add(pageNum);
      }
    } else if (seamlessHighlightActive && !bufferedHighlightMode && matchPageSet.has(pageNum)) {
      highlightPageMatches(pageNum, { append: true });
    }
  })();

  pageLoadPromises[pageNum] = loadPromise;
  try {
    await loadPromise;
  } finally {
    // Only clear our own registration; a newer load for the same page
    // number may have been registered in the meantime.
    if (pageLoadPromises[pageNum] === loadPromise) {
      delete pageLoadPromises[pageNum];
    }
  }
}
703
+
704
/**
 * Insert a page element into pagesDiv so pages stay in ascending
 * `data-page` order.
 *
 * The element goes before the first existing `.page` with a larger page
 * number, or at the end when there is none.
 *
 * @param {HTMLElement} pageEl Page element carrying a `data-page` attribute.
 */
function insertPageInOrder(pageEl) {
  const newNum = parseInt(pageEl.dataset.page, 10);
  const successor = Array.from(pagesDiv.querySelectorAll('.page'))
    .find(node => newNum < parseInt(node.dataset.page, 10));

  if (successor) {
    pagesDiv.insertBefore(pageEl, successor);
  } else {
    pagesDiv.appendChild(pageEl);
  }
}
720
+
721
+ /* ------------- Highlighting (percentage-based) ------------- */
722
/** Remove every `.hl-box` highlight element from the document. */
function clearAllHighlights() {
  for (const box of document.querySelectorAll('.hl-box')) {
    box.remove();
  }
}
725
/**
 * Remove all `.hl-box` highlight elements from a single page.
 * Silently does nothing when the page has no element in the DOM.
 *
 * @param {number} pageNum Page number to clear.
 */
function clearHighlightsOnPage(pageNum) {
  const host = document.querySelector(`.page[data-page="${pageNum}"]`);
  if (host) {
    for (const box of host.querySelectorAll('.hl-box')) {
      box.remove();
    }
  }
}
730
/**
 * Draw a highlight box over every token on a page whose text equals one
 * of the current search words (case-insensitive, whole token).
 *
 * Box geometry is written as percentages of the overlay, computed from
 * each token's bbox by multiplying its values by 100.
 *
 * @param {number} pageNum Page to highlight; must already be in pageCache.
 * @param {{append?: boolean}} [options] When `append` is false (default)
 *   existing boxes on the page are removed first.
 */
function highlightPageMatches(pageNum, { append = false } = {}) {
  const cache = pageCache[pageNum];
  if (!cache || !currentWords.length) return;
  if (!append) clearHighlightsOnPage(pageNum);

  // Normalise both sides so matching is case-insensitive even if the
  // search words were not lower-cased by the caller.
  const targets = new Set(currentWords.map(word => String(word).toLowerCase()));
  const frag = document.createDocumentFragment();

  for (const tok of cache.tokens || []) {
    // Skip malformed tokens rather than aborting the whole page.
    if (!tok || typeof tok.text !== 'string') continue;
    if (!Array.isArray(tok.bbox) || tok.bbox.length < 4) continue;
    if (!targets.has(tok.text.toLowerCase())) continue;

    const [x0, y0, x1, y1] = tok.bbox;
    const box = document.createElement('div');
    box.className = 'hl-box';
    box.style.left = (x0 * 100) + '%';
    box.style.top = (y0 * 100) + '%';
    box.style.width = ((x1 - x0) * 100) + '%';
    box.style.height = ((y1 - y0) * 100) + '%';
    frag.appendChild(box);
  }

  // Single DOM insertion for all boxes.
  cache.overlay.appendChild(frag);
}
752
+
753
+ /* ------------- Resize (no-op for percentage highlights) ------------- */
754
// Highlights are positioned in percentages, so a resize needs no work;
// the listener is registered with an intentionally empty handler.
window.addEventListener('resize', function onWindowResize() {});
755
+
756
+ /* ------------- Zoom ------------- */
757
/** Enable both zoom buttons. */
function enableZoom() {
  for (const button of [zoomIn, zoomOut]) {
    button.disabled = false;
  }
}
758
/**
 * Disable both zoom buttons and return the view to its unscaled state:
 * scale 1.0, a "100%" readout and no transform on pagesDiv.
 */
function disableZoom() {
  for (const button of [zoomIn, zoomOut]) {
    button.disabled = true;
  }
  currentScale = 1.0;
  zoomVal.textContent = '100%';
  pagesDiv.style.transform = '';
}
765
// Each click moves the scale by one SCALE_STEP; applyZoom does the clamping.
zoomIn.addEventListener('click', function onZoomInClick() {
  applyZoom(currentScale + SCALE_STEP);
});
zoomOut.addEventListener('click', function onZoomOutClick() {
  applyZoom(currentScale - SCALE_STEP);
});
767
/**
 * Apply a zoom level to the page container.
 *
 * The requested scale is clamped to [MIN_SCALE, MAX_SCALE]. Nothing
 * happens without a document or when the clamped value is within 0.001
 * of the current scale.
 *
 * @param {number} newScale Requested scale factor (1.0 = 100%).
 */
function applyZoom(newScale) {
  if (!currentDoc) return;

  const clamped = Math.min(MAX_SCALE, Math.max(MIN_SCALE, newScale));
  const delta = Math.abs(clamped - currentScale);
  if (delta < 0.001) return;

  currentScale = clamped;
  zoomVal.textContent = Math.round(currentScale * 100) + '%';

  const { style } = pagesDiv;
  style.transformOrigin = 'top center';
  style.transform = `scale(${currentScale})`;
}
776
+
777
+ /* ------------- Sidebar Resize ------------- */
778
// Wire up drag-to-resize on the sidebar divider. While dragging, the
// --sidebar-width custom property follows the pointer's X position,
// limited to 220px .. min(60% of the window width, 700px).
(function enableDivider() {
  let isDragging = false;

  function startDrag() {
    isDragging = true;
    // Prevent text selection and keep the resize cursor during the drag.
    document.body.style.userSelect = 'none';
    document.documentElement.style.cursor = 'col-resize';
  }

  function stopDrag() {
    if (!isDragging) return;
    isDragging = false;
    document.body.style.userSelect = '';
    document.documentElement.style.cursor = '';
  }

  function resizeSidebar(event) {
    if (!isDragging) return;
    const lowerBound = 220;
    const upperBound = Math.min(window.innerWidth * 0.6, 700);
    const width = Math.max(lowerBound, Math.min(upperBound, event.clientX));
    document.documentElement.style.setProperty('--sidebar-width', width + 'px');
  }

  divider.addEventListener('mousedown', startDrag);
  window.addEventListener('mouseup', stopDrag);
  window.addEventListener('mousemove', resizeSidebar);
})();
800
+
801
+ /* ------------- Reset ------------- */
802
/**
 * Return the viewer to its initial, document-less state.
 *
 * Clears search/highlight state, cached pages, in-flight load bookkeeping
 * and scroll tracking; empties the UI panels; disables zoom and "load
 * all"; disconnects the page observer; and signals any background load
 * to abort.
 */
function resetAll() {
  // --- Document and search state ---
  currentDoc = null;
  currentWords = [];
  searchResults = [];
  currentSelectedPage = null;
  pageCache = {};
  // Forget loads started for the previous document; otherwise a request
  // for the same page number in the next document would reuse the stale
  // in-flight promise. Keys are deleted in place so this works however
  // the map was declared.
  for (const key of Object.keys(pageLoadPromises)) {
    delete pageLoadPromises[key];
  }
  matchPageSet.clear();
  seamlessHighlightActive = false;
  highlightedPages.clear();

  // --- Scroll tracking ---
  currentCenterPage = null;
  scrollDirection = 0;
  programmaticScrollInProgress = false;

  // --- UI ---
  fileInfo.textContent = '';
  resultsList.innerHTML = '';
  pageText.value = '';
  legend.innerHTML = '<span class="dim">No words</span>';
  pagesDiv.innerHTML = '';
  disableZoom(); // also clears the pagesDiv transform
  loadAllBtn.disabled = true;
  downloadOcrLink.style.display = 'none';
  ocrStatusNote.style.display = 'none';
  setStatus("Ready.");

  // --- Observers and background work ---
  if (pageObserver) {
    pageObserver.disconnect();
    pageObserver = null;
  }
  bgLoadAbort = true;
  bgLoadActive = false;
}

// Initial status shown once the script has loaded.
setStatus("Ready.");
static/style.css ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* ---------- Theme tokens ---------- */
:root {
  --accent: #ffa800;
  --accent-alt: #ffb94d;
  --bg: #1e1f24;
  --bg-alt: #272a33;
  --bg-soft: #303542;
  --border: #3c4250;
  --text: #e5e8ef;
  --text-dim: #b2b8c6;
  --radius: 8px;
  --sidebar-width: 360px;
  /* Page box proportions consumed by the .page aspect-ratio rule. */
  --page-aspect-w: 612;
  --page-aspect-h: 792;
}

/* ---------- Base ---------- */
* {
  box-sizing: border-box;
}

html,
body {
  margin: 0;
  height: 100%;
  font-family: system-ui, -apple-system, Segoe UI, Roboto, Ubuntu, sans-serif;
  background: var(--bg);
  color: var(--text);
}

/* Full-height column; scrolling happens inside the panels, not the body. */
body {
  display: flex;
  flex-direction: column;
  overflow: hidden;
}

/* ---------- Header and control bar ---------- */
header {
  background: linear-gradient(90deg, var(--bg-alt), var(--bg-soft));
  padding: 10px 16px;
  display: flex;
  flex-direction: column;
  gap: 10px;
  box-shadow: 0 4px 10px -2px rgba(0, 0, 0, .4);
}

header h1 {
  margin: 0;
  font-size: 18px;
  letter-spacing: .5px;
}

.controls {
  display: flex;
  flex-wrap: wrap;
  gap: 12px;
  align-items: center;
}

.controls .block {
  display: flex;
  align-items: center;
  gap: 8px;
}

.controls .grow {
  flex: 1;
  min-width: 220px;
}
32
+
33
/* ---------- Text inputs ---------- */
input[type=text] {
  width: 100%;
  padding: 8px 10px;
  border: 1px solid var(--border);
  border-radius: 6px;
  background: var(--bg-soft);
  color: var(--text);
  font-size: 14px;
  outline: none;
}

input[type=text]:focus {
  border-color: var(--accent);
  box-shadow: 0 0 0 1px var(--accent);
}

/* ---------- Buttons ---------- */
/* Shared geometry for every button-like control. */
.btn,
.btn-secondary,
button {
  cursor: pointer;
  border: none;
  border-radius: 6px;
  padding: 8px 14px;
  font-size: 14px;
  display: inline-flex;
  align-items: center;
  font-weight: 600;
}

/* Accent-colored primary action. */
.btn,
button.primary {
  background: var(--accent);
  color: #222;
}

.btn:hover,
button.primary:hover {
  background: var(--accent-alt);
}

.btn-secondary {
  background: #444;
  color: #eee;
  font-weight: 500;
  box-shadow: 0 2px 6px -1px rgba(0, 0, 0, .5);
  text-decoration: none;
}

.btn-secondary:hover {
  background: #555;
}

/* Plain buttons; the more specific primary/.btn rules above still win. */
button {
  background: var(--bg-soft);
  color: var(--text);
  font-weight: 500;
  border: 1px solid var(--border);
}

button:hover {
  background: #3b414f;
}

button:disabled {
  opacity: .5;
  cursor: not-allowed;
}
61
+
62
/* ---------- Main layout: sidebar | divider | pages ---------- */
#layout {
  flex: 1;
  display: grid;
  grid-template-columns: var(--sidebar-width) 6px 1fr;
  min-height: 0;
}

#sidebar {
  overflow: auto;
  padding: 14px;
  display: flex;
  flex-direction: column;
  gap: 20px;
  background: var(--bg-alt);
}

/* Draggable strip between sidebar and pages. */
#divider {
  background: linear-gradient(180deg, var(--bg-alt), var(--bg-soft));
  cursor: col-resize;
  position: relative;
}

/* Centered grip mark on the divider. */
#divider:after {
  content: "";
  position: absolute;
  left: 50%;
  top: 50%;
  width: 4px;
  height: 42px;
  transform: translate(-50%, -50%);
  background: var(--border);
  border-radius: 2px;
}
77
+
78
/* ---------- Sidebar sections ---------- */
section h2 {
  margin: 0 0 6px;
  font-size: 13px;
  text-transform: uppercase;
  letter-spacing: 1px;
  font-weight: 600;
  color: var(--text-dim);
}

/* Legend box. */
.legend {
  background: var(--bg-soft);
  padding: 8px 10px;
  border: 1px solid var(--border);
  border-radius: 6px;
  min-height: 42px;
  font-size: 13px;
  display: flex;
  flex-wrap: wrap;
  gap: 6px;
  align-items: center;
}

.swatch {
  width: 24px;
  height: 14px;
  background: var(--accent);
  border: 1px solid #0006;
  border-radius: 3px;
  box-shadow: 0 0 0 1px #0004;
}

/* Scrollable results list. */
.results {
  list-style: none;
  margin: 0;
  padding: 0;
  border: 1px solid var(--border);
  border-radius: 6px;
  background: var(--bg-soft);
  max-height: 250px;
  overflow: auto;
}

.results li {
  padding: 6px 10px;
  font-size: 13px;
  display: flex;
  justify-content: space-between;
  gap: 10px;
  border-bottom: 1px solid #ffffff08;
  cursor: pointer;
  transition: background .15s;
}

.results li:last-child {
  border-bottom: none;
}

.results li:hover {
  background: #ffffff08;
}

.results li.active {
  background: var(--accent);
  color: #222;
  font-weight: 600;
}

.note {
  font-size: 11.5px;
  color: var(--text-dim);
  margin: 6px 0 0;
  line-height: 1.4;
}

/* Monospace page-text box. */
textarea#pageText {
  width: 100%;
  min-height: 200px;
  resize: vertical;
  background: var(--bg-soft);
  color: var(--text);
  border: 1px solid var(--border);
  border-radius: 6px;
  padding: 10px 12px;
  font-size: 12.5px;
  line-height: 1.4;
  font-family: ui-monospace, SFMono-Regular, Consolas, "Roboto Mono", monospace;
}
121
+
122
/* ---------- Page viewer ---------- */
#pagesWrap {
  overflow: auto;
  background: #16171b;
}

#pages {
  padding: 24px clamp(16px, 4vw, 60px);
  display: flex;
  flex-direction: column;
  gap: 36px;
  align-items: center;
}

/* One page; the box keeps its proportions from the aspect tokens. */
.page {
  position: relative;
  width: 100%;
  max-width: 1200px;
  background: #000;
  border-radius: 4px;
  box-shadow: 0 4px 24px -4px #000a, 0 0 0 1px #000;
  overflow: hidden;
  aspect-ratio: var(--page-aspect-w) / var(--page-aspect-h);
}

/* Placeholder variant of a page box. */
.page.placeholder {
  display: flex;
  align-items: center;
  justify-content: center;
  background: linear-gradient(135deg, #222 0%, #2d3038 60%);
  color: #666;
  font-size: 12px;
}

.page.placeholder .page-inner {
  display: flex;
  flex-direction: column;
  align-items: center;
  gap: 10px;
}

.placeholder-label {
  font-weight: 600;
  letter-spacing: .5px;
  opacity: .85;
}

.placeholder-spinner {
  width: 42px;
  height: 42px;
  border: 5px solid #ffffff14;
  border-top: 5px solid var(--accent);
  border-right: 5px solid var(--accent);
  border-radius: 50%;
  animation: spin 0.9s linear infinite;
  box-shadow: 0 0 6px -1px #000;
}

.page img {
  display: block;
  width: 100%;
  height: 100%;
  object-fit: cover;
}

/* "Page N" badge in the top-left corner. */
.page-label {
  position: absolute;
  top: 0;
  left: 0;
  padding: 4px 8px;
  margin: 6px;
  background: rgba(0, 0, 0, .55);
  color: #fff;
  font-size: 12px;
  letter-spacing: .5px;
  border-radius: 4px;
  font-weight: 600;
  pointer-events: none;
}

/* Highlight rectangle; absolutely positioned and ignores pointer events. */
.hl-box {
  position: absolute;
  border: 2px solid var(--accent);
  background: rgba(255, 168, 0, 0.28);
  border-radius: 2px;
  box-shadow: 0 0 0 1px #0005;
  pointer-events: none;
}
191
+
192
/* ---------- Floating status pill ---------- */
#statusBar {
  position: fixed;
  bottom: 10px;
  left: 12px;
  background: rgba(15, 16, 20, .85);
  backdrop-filter: blur(4px);
  color: var(--text);
  font-size: 12.5px;
  padding: 6px 12px;
  border-radius: 32px;
  border: 1px solid var(--border);
  box-shadow: 0 4px 12px -2px rgba(0, 0, 0, .55);
  z-index: 100;
}

/* ---------- Full-screen processing overlay ---------- */
#processingOverlay {
  position: fixed;
  inset: 0;
  background: rgba(12, 14, 18, 0.94);
  backdrop-filter: blur(6px);
  display: flex;
  align-items: center;
  justify-content: center;
  flex-direction: column;
  z-index: 400;
  padding: 40px 30px;
  text-align: center;
}

#processingOverlay.hidden {
  display: none;
}

#processingOverlay .overlay-content {
  max-width: 520px;
  display: flex;
  flex-direction: column;
  align-items: center;
  gap: 18px;
}

.spinner {
  width: 60px;
  height: 60px;
  border: 6px solid #ffffff18;
  border-top: 6px solid var(--accent);
  border-right: 6px solid var(--accent);
  border-radius: 50%;
  animation: spin 0.9s linear infinite;
}

/* Rotation used by both spinners. */
@keyframes spin {
  to {
    transform: rotate(360deg);
  }
}

.detail {
  font-size: 14px;
  color: var(--text-dim);
  margin: 0;
  line-height: 1.5;
}

.hint {
  font-size: 12px;
  color: var(--text-dim);
  margin: 0;
}

/* ---------- Utilities ---------- */
.dim {
  opacity: .6;
}
234
+
235
/* ---------- Narrow viewports: stack sidebar above the pages ---------- */
@media (max-width: 1100px) {
  #layout {
    grid-template-columns: minmax(0, 1fr);
    grid-auto-rows: auto;
  }

  /* No room for drag-resizing in the single-column layout. */
  #divider {
    display: none;
  }

  #sidebar {
    order: 1;
    max-height: 42vh;
  }

  #pagesWrap {
    order: 2;
  }
}
templates/index.html ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
<head>
  <meta charset="UTF-8">
  <title>PDF Exact Word Page Finder (OCR)</title>
  <meta name="viewport" content="width=device-width,initial-scale=1">
  <link rel="stylesheet" href="/static/style.css">
  <script>
    // Server-provided settings read by the front-end script.
    // `tojson` emits a correctly quoted and escaped JavaScript string;
    // plain {{ ... }} interpolation applies HTML escaping, which is the
    // wrong escaping inside a <script> string literal. `default("")`
    // keeps the page rendering if the variable is not supplied.
    window.APP_CONFIG = {
      highlightColor: {{ highlight_color|default("")|tojson }},
      buildVersion: "buffered-highlights-2"
    };
  </script>
  <style>
    /* Overlay-only controls; both start hidden via display:none. */
    #processingOverlay .close-btn {
      display:none;
      margin-top:4px;
      background:#444;
      color:#eee;
      padding:6px 14px;
      border-radius:20px;
      font-size:13px;
      cursor:pointer;
      border:1px solid #666;
    }
    #processingOverlay .close-btn:hover { background:#555; }
    #processingOverlay .error-line {
      font-size:12px;
      color:#ff6666;
      margin:0;
      display:none;
      max-width:480px;
      word-break:break-word;
    }
  </style>
</head>
37
<body>
  <!-- Top bar: upload, OCR options, search, zoom and bulk actions -->
  <header>
    <h1>PDF Exact Word Page Finder</h1>
    <div class="controls">
      <div class="block">
        <label class="btn">
          <input type="file" id="pdfInput" accept="application/pdf" hidden>
          <span>Upload PDF</span>
        </label>
        <span id="fileInfo" class="dim"></span>
      </div>

      <div class="block">
        <label style="display:flex;align-items:center;gap:4px;font-size:12px;">
          <input type="checkbox" id="ocrToggle">
          OCR
        </label>
        <!-- aria-label: a placeholder alone is not an accessible name -->
        <input type="text" id="ocrLang" placeholder="eng" aria-label="OCR language code" style="width:80px;">
      </div>

      <div class="block grow">
        <input type="text" id="wordsInput" placeholder="Enter words (space / comma separated)" aria-label="Words to search for" autocomplete="off">
      </div>

      <button id="searchBtn" class="primary">Search</button>

      <div class="block">
        <!-- "-" and "+" glyphs need explicit names for assistive technology -->
        <button id="zoomOut" aria-label="Zoom out" disabled>-</button>
        <span id="zoomVal">100%</span>
        <button id="zoomIn" aria-label="Zoom in" disabled>+</button>
      </div>

      <div class="block">
        <button id="loadAllBtn" disabled title="Load any remaining pages">Load All Pages</button>
      </div>

      <div class="block">
        <a id="downloadOcrLink" class="btn btn-secondary" style="display:none;">Download OCR PDF</a>
      </div>
    </div>
  </header>

  <!-- Main area: sidebar | draggable divider | page viewer -->
  <main id="layout">
    <aside id="sidebar">
      <section>
        <h2>Legend</h2>
        <div id="legend" class="legend"><span class="dim">No words</span></div>
      </section>
      <section>
        <h2>Pages with Matches</h2>
        <ul id="resultsList" class="results"></ul>
        <p class="note">Case-insensitive whole-token matching.</p>
        <p class="note" id="ocrStatusNote" style="display:none;"></p>
      </section>
      <section>
        <h2>Page Text</h2>
        <textarea id="pageText" placeholder="Select a page result" aria-label="Page text"></textarea>
      </section>
    </aside>
    <div id="divider" title="Drag to resize sidebar"></div>
    <section id="pagesWrap">
      <div id="pages"></div>
    </section>
  </main>

  <!-- Status messages are announced politely to screen readers -->
  <div id="statusBar" role="status" aria-live="polite"><span id="statusMsg">Ready.</span></div>

  <!-- Blocking overlay shown while a PDF is being processed -->
  <div id="processingOverlay" class="hidden" aria-live="polite">
    <div class="overlay-content">
      <div class="spinner" id="processingSpinner"></div>
      <h2 id="processingTitle">Processing PDF...</h2>
      <p id="processingDetail" class="detail"></p>
      <p class="hint" id="processingHint">
        Do not close this tab. OCR can take time on large or scanned PDFs.
      </p>
      <p class="error-line" id="processingError"></p>
      <button type="button" id="overlayCloseBtn" class="close-btn">Close</button>
    </div>
  </div>

  <script src="/static/app.js?v=buffered-highlights-2"></script>
</body>
119
+ </html>