MikeTrizna committed on
Commit
5b47a5c
·
1 Parent(s): 9efc1d1

Initial load of web page and supporting files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.pdf filter=lfs diff=lfs merge=lfs -text
data.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "00001",
4
+ "newspaper_title": "The Delaware gazette",
5
+ "publication_date": "1809-07-08",
6
+ "page": "1",
7
+ "loc_record_url": "https://www.loc.gov/resource/sn82014385/1809-07-08/ed-1/?sp=1",
8
+ "pdf_path": "pdfs/00001.pdf",
9
+ "textract_ocr_file": "textract_ocr/00001_layout_extracted.txt",
10
+ "loc_altoxml_path": "loc_ocr/00001.xml"
11
+ }
12
+ ]
index.html CHANGED
@@ -1,19 +1,735 @@
1
- <!doctype html>
2
- <html>
3
- <head>
4
- <meta charset="utf-8" />
5
- <meta name="viewport" content="width=device-width" />
6
- <title>My static Space</title>
7
- <link rel="stylesheet" href="style.css" />
8
- </head>
9
- <body>
10
- <div class="card">
11
- <h1>Welcome to your static Space!</h1>
12
- <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
13
- <p>
14
- Also don't forget to check the
15
- <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
16
- </p>
17
- </div>
18
- </body>
19
- </html>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Historical Newspaper OCR Viewer</title>
7
+
8
+ <!-- PDF.js Library -->
9
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.min.js"></script>
10
+
11
<style>
    /* ---------- Reset & page shell ---------- */
    * {
        margin: 0;
        padding: 0;
        box-sizing: border-box;
    }

    /* The page is a fixed-height column: header, optional metadata bar, then
       the two-panel viewer that takes whatever height is left. Using flex here
       (instead of a hard-coded calc(100vh - 180px)) keeps the viewer from being
       clipped when the header wraps or the metadata bar is shown. */
    body {
        font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
        height: 100vh;
        overflow: hidden;
        background-color: #f5f5f5;
        display: flex;
        flex-direction: column;
    }

    /* ---------- Header & record picker ---------- */
    .header {
        flex: 0 0 auto;
        background-color: #2c3e50;
        color: white;
        padding: 15px 20px;
        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    }

    .header h1 {
        font-size: 24px;
        margin-bottom: 10px;
    }

    .controls {
        display: flex;
        gap: 15px;
        align-items: center;
        flex-wrap: wrap;
    }

    .select-container {
        flex: 1;
        max-width: 800px;
    }

    .select-container select {
        width: 100%;
        padding: 8px 12px;
        border: 1px solid #ddd;
        border-radius: 4px;
        font-size: 14px;
        background-color: white;
    }

    /* ---------- Metadata bar (hidden until a record is loaded) ---------- */
    .metadata {
        flex: 0 0 auto;
        background-color: #ecf0f1;
        padding: 15px 20px;
        border-bottom: 1px solid #bdc3c7;
        display: none;
    }

    .metadata.active {
        display: block;
    }

    .metadata-content {
        display: flex;
        gap: 20px;
        align-items: center;
        flex-wrap: wrap;
    }

    .metadata-item {
        font-size: 14px;
    }

    .metadata-item strong {
        color: #2c3e50;
    }

    .metadata-item a {
        color: #3498db;
        text-decoration: none;
    }

    .metadata-item a:hover {
        text-decoration: underline;
    }

    /* ---------- Two-panel viewer ---------- */
    .main-container {
        display: flex;
        flex: 1;
        min-height: 0; /* lets the panels scroll instead of growing the page */
        background-color: white;
    }

    .pdf-panel, .text-panel {
        width: 50%;
        display: flex;
        flex-direction: column;
        border-right: 1px solid #ddd;
    }

    .text-panel {
        border-right: none;
    }

    .panel-header {
        background-color: #34495e;
        color: white;
        padding: 12px 20px;
        font-weight: 600;
        font-size: 14px;
        display: flex;
        justify-content: space-between;
        align-items: center;
    }

    /* ---------- PDF zoom controls ---------- */
    .pdf-controls {
        display: flex;
        gap: 10px;
        align-items: center;
    }

    .zoom-btn {
        background-color: #2c3e50;
        color: white;
        border: none;
        padding: 5px 12px;
        border-radius: 4px;
        cursor: pointer;
        font-size: 16px;
        font-weight: bold;
        transition: background-color 0.2s;
        min-width: 32px;
    }

    .zoom-btn:hover:not(:disabled) {
        background-color: #1a252f;
    }

    .zoom-btn:disabled {
        background-color: #95a5a6;
        cursor: not-allowed;
        opacity: 0.5;
    }

    .zoom-level {
        font-size: 13px;
        min-width: 50px;
        text-align: center;
    }

    /* ---------- OCR source toggle & export ---------- */
    .ocr-controls {
        display: flex;
        gap: 15px;
        align-items: center;
    }

    .ocr-toggle {
        display: flex;
        gap: 10px;
        background-color: #2c3e50;
        padding: 5px;
        border-radius: 4px;
    }

    .ocr-toggle label {
        position: relative; /* anchors the visually hidden radio below */
        padding: 5px 12px;
        cursor: pointer;
        border-radius: 3px;
        font-size: 13px;
        transition: background-color 0.2s;
    }

    /* Hide the native radio visually but keep it focusable and exposed to
       assistive technology; display: none would remove it from the tab order. */
    .ocr-toggle input[type="radio"] {
        position: absolute;
        width: 1px;
        height: 1px;
        margin: -1px;
        overflow: hidden;
        clip-path: inset(50%);
        white-space: nowrap;
        opacity: 0;
    }

    .ocr-toggle input[type="radio"]:checked + span {
        background-color: #3498db;
    }

    /* Keyboard focus indicator for the custom-styled radio pill. */
    .ocr-toggle input[type="radio"]:focus-visible + span {
        outline: 2px solid white;
        outline-offset: 2px;
    }

    .ocr-toggle span {
        padding: 5px 12px;
        border-radius: 3px;
        font-size: 13px;
    }

    .export-btn {
        background-color: #27ae60;
        color: white;
        border: none;
        padding: 6px 16px;
        border-radius: 4px;
        cursor: pointer;
        font-size: 13px;
        font-weight: 600;
        transition: background-color 0.2s;
    }

    .export-btn:hover {
        background-color: #229954;
    }

    .export-btn:disabled {
        background-color: #95a5a6;
        cursor: not-allowed;
    }

    /* ---------- Panel bodies ---------- */
    .panel-content {
        flex: 1;
        overflow: auto;
        position: relative;
    }

    .panel-content.grabbable {
        cursor: grab;
    }

    .panel-content.grabbing {
        cursor: grabbing;
    }

    #pdf-canvas {
        display: block;
        margin: 20px auto;
        box-shadow: 0 2px 8px rgba(0,0,0,0.1);
    }

    #ocr-text {
        width: 100%;
        height: 100%;
        border: none;
        padding: 20px;
        font-family: 'Courier New', monospace;
        font-size: 14px;
        line-height: 1.6;
        resize: none;
        outline: none;
    }

    /* Replacement focus ring for the textarea, since the default outline is
       removed above; inset so it is not clipped by the scrolling panel. */
    #ocr-text:focus-visible {
        outline: 2px solid #3498db;
        outline-offset: -2px;
    }

    /* ---------- Loading, error and empty states ---------- */
    .loading {
        position: absolute;
        top: 50%;
        left: 50%;
        transform: translate(-50%, -50%);
        text-align: center;
        color: #7f8c8d;
    }

    .loading-spinner {
        border: 4px solid #ecf0f1;
        border-top: 4px solid #3498db;
        border-radius: 50%;
        width: 40px;
        height: 40px;
        animation: spin 1s linear infinite;
        margin: 0 auto 10px;
    }

    @keyframes spin {
        0% { transform: rotate(0deg); }
        100% { transform: rotate(360deg); }
    }

    .error-message {
        color: #e74c3c;
        padding: 20px;
        text-align: center;
    }

    .empty-state {
        text-align: center;
        color: #95a5a6;
        padding: 40px;
        font-size: 16px;
    }

    .empty-state-icon {
        font-size: 48px;
        margin-bottom: 10px;
    }
</style>
287
+ </head>
288
+ <body>
289
<!-- Page header with the record picker. The select is enabled by the script
     once data.json has been loaded. -->
<header class="header">
    <h1>Historical Newspaper OCR Viewer</h1>
    <div class="controls">
        <div class="select-container">
            <select id="record-select" aria-label="Newspaper record" disabled>
                <option value="">Loading records...</option>
            </select>
        </div>
    </div>
</header>

<!-- Metadata bar: revealed (class "active") and filled in by the script when
     a record is selected. -->
<div class="metadata" id="metadata">
    <div class="metadata-content">
        <div class="metadata-item">
            <strong>Title:</strong> <span id="meta-title">-</span>
        </div>
        <div class="metadata-item">
            <strong>Date:</strong> <span id="meta-date">-</span>
        </div>
        <div class="metadata-item">
            <strong>Page:</strong> <span id="meta-page">-</span>
        </div>
        <div class="metadata-item">
            <!-- href is a placeholder; the script sets the real URL per record. -->
            <a id="meta-link" href="#" target="_blank" rel="noopener noreferrer">View on Library of Congress →</a>
        </div>
    </div>
</div>

<main class="main-container">
    <!-- Left panel: rendered PDF page with zoom controls. -->
    <div class="pdf-panel">
        <div class="panel-header">
            <span>PDF Viewer</span>
            <div class="pdf-controls">
                <button class="zoom-btn" id="zoom-out-btn" type="button" title="Zoom Out" aria-label="Zoom out" disabled>−</button>
                <span class="zoom-level" id="zoom-level" aria-live="polite">100%</span>
                <button class="zoom-btn" id="zoom-in-btn" type="button" title="Zoom In" aria-label="Zoom in" disabled>+</button>
                <button class="zoom-btn" id="zoom-reset-btn" type="button" title="Reset Zoom" aria-label="Reset zoom" disabled>Reset</button>
            </div>
        </div>
        <div class="panel-content" id="pdf-container">
            <div class="empty-state">
                <div class="empty-state-icon" aria-hidden="true">📄</div>
                <div>Select a record to view the PDF</div>
            </div>
        </div>
    </div>

    <!-- Right panel: editable OCR text with source toggle and export. -->
    <div class="text-panel">
        <div class="panel-header">
            <span>OCR Text</span>
            <div class="ocr-controls">
                <div class="ocr-toggle" role="radiogroup" aria-label="OCR source">
                    <label>
                        <input type="radio" name="ocr-source" value="textract" checked>
                        <span>Textract OCR</span>
                    </label>
                    <label>
                        <input type="radio" name="ocr-source" value="loc">
                        <span>Library of Congress OCR</span>
                    </label>
                </div>
                <button class="export-btn" id="export-btn" type="button" disabled>Export Text</button>
            </div>
        </div>
        <div class="panel-content">
            <textarea
                id="ocr-text"
                aria-label="OCR text"
                placeholder="Select a record and OCR source to view text..."
                disabled
            ></textarea>
        </div>
    </div>
</main>
362
+
363
+ <script>
364
// PDF.js runs its parser in a web worker; point it at the worker script that
// matches the 3.11.174 library build loaded in <head>.
pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.worker.min.js';

// Mutable application state shared by every function in this script.
const state = {
    // Records parsed from data.json and the one currently displayed.
    records: [],
    currentRecord: null,

    // OCR panel: selected source, the text as loaded, and whether the
    // textarea content differs from it.
    currentOcrSource: 'textract',
    originalOcrText: '',
    hasEdits: false,

    isLoading: false,

    // PDF panel: loaded PDF.js document and the zoom multiplier.
    currentPdf: null,
    zoomLevel: 1.0
};

// Cached DOM references: PDF panel.
const pdfContainer = document.querySelector('#pdf-container');
const zoomInBtn = document.querySelector('#zoom-in-btn');
const zoomOutBtn = document.querySelector('#zoom-out-btn');
const zoomResetBtn = document.querySelector('#zoom-reset-btn');
const zoomLevel = document.querySelector('#zoom-level');

// Cached DOM references: OCR panel.
const ocrText = document.querySelector('#ocr-text');
const exportBtn = document.querySelector('#export-btn');
const ocrRadios = document.querySelectorAll('input[name="ocr-source"]');

// Cached DOM references: header and metadata bar.
const recordSelect = document.querySelector('#record-select');
const metadata = document.querySelector('#metadata');
390
+
391
+ // Initialize application
392
/**
 * Application entry point: fetches data.json, stores the parsed records in
 * `state.records`, fills the record picker, enables it and wires up events.
 * Any failure is logged, shown in the picker, and reported with an alert.
 */
async function init() {
    try {
        const dataResponse = await fetch('data.json');
        if (dataResponse.ok === false) {
            throw new Error('Failed to load data.json');
        }

        const loadedRecords = await dataResponse.json();
        state.records = loadedRecords;

        // Order matters: the picker is filled before it is enabled.
        populateRecordSelect();
        enableControls();
        setupEventListeners();
    } catch (initError) {
        console.error('Initialization error:', initError);
        recordSelect.innerHTML = '<option value="">Error loading records</option>';
        alert('Failed to load data.json. Please ensure the file exists in the same directory as index.html.');
    }
}
409
+
410
/** Makes the record picker interactive by clearing its `disabled` attribute. */
function enableControls() {
    recordSelect.removeAttribute('disabled');
}
413
+
414
/**
 * Rebuilds the record picker: a placeholder entry followed by one option per
 * entry in `state.records`, keyed by record id and labelled with id, title,
 * publication date and page.
 */
function populateRecordSelect() {
    recordSelect.innerHTML = '<option value="">Select a record...</option>';

    // Build all options off-DOM, then attach them in one go.
    const options = document.createDocumentFragment();
    for (const entry of state.records) {
        const item = Object.assign(document.createElement('option'), {
            value: entry.id,
            textContent: `${entry.id} - ${entry.newspaper_title} ${entry.publication_date} Page ${entry.page}`
        });
        options.appendChild(item);
    }
    recordSelect.appendChild(options);
}
424
+
425
/**
 * Wires up all UI event handlers: record selection, OCR source toggle, edit
 * tracking, export, zoom buttons and drag-to-pan.
 */
function setupEventListeners() {
    // Record selection
    recordSelect.addEventListener('change', (e) => {
        const recordId = e.target.value;
        if (!recordId) {
            return; // placeholder entry chosen; keep the current record on screen
        }

        // Loading another record replaces the textarea content, so ask first,
        // the same way the OCR source toggle below does.
        if (state.hasEdits) {
            const confirmed = confirm('You have unsaved edits. Switching records will discard your changes. Continue?');
            if (!confirmed) {
                // Put the picker back on the record that is still displayed.
                recordSelect.value = state.currentRecord ? String(state.currentRecord.id) : '';
                return;
            }
        }

        // Option values are always strings, while ids in data.json could be
        // numbers; compare as strings so such ids still match.
        const record = state.records.find(r => String(r.id) === recordId);
        if (!record) {
            console.error('No record found for id:', recordId);
            return;
        }

        // loadRecord is async; surface unexpected failures instead of leaving
        // an unhandled promise rejection.
        loadRecord(record).catch((error) => {
            console.error('Record loading error:', error);
        });
    });

    // OCR source toggle
    ocrRadios.forEach(radio => {
        radio.addEventListener('change', (e) => {
            if (state.hasEdits) {
                const confirmed = confirm('You have unsaved edits. Switching OCR source will discard your changes. Continue?');
                if (!confirmed) {
                    // Revert to previous selection
                    document.querySelector(`input[name="ocr-source"][value="${state.currentOcrSource}"]`).checked = true;
                    return;
                }
            }
            state.currentOcrSource = e.target.value;
            if (state.currentRecord) {
                loadOcrText(state.currentRecord);
            }
        });
    });

    // Track edits: the text counts as edited only while it differs from what
    // was loaded, so undoing every change clears the flag again.
    ocrText.addEventListener('input', () => {
        state.hasEdits = ocrText.value !== state.originalOcrText;
    });

    // Export button
    exportBtn.addEventListener('click', exportText);

    // Zoom controls (steps of 25 percentage points)
    zoomInBtn.addEventListener('click', () => zoomPdf(0.25));
    zoomOutBtn.addEventListener('click', () => zoomPdf(-0.25));
    zoomResetBtn.addEventListener('click', () => resetZoom());

    // Pan functionality
    setupPanControls();
}
469
+
470
/**
 * Makes `record` the current record: clears the edit flag, shows and fills
 * the metadata bar, then loads the PDF and the OCR text concurrently.
 *
 * @param {Object} record - Entry from data.json (reads newspaper_title,
 *     publication_date, page, loc_record_url and pdf_path here).
 * @returns {Promise<void>} Settles once both loads have finished.
 */
async function loadRecord(record) {
    state.currentRecord = record;
    state.hasEdits = false;

    // Reveal the metadata bar and fill each text field from the record.
    metadata.classList.add('active');
    const textFields = [
        ['meta-title', record.newspaper_title],
        ['meta-date', record.publication_date],
        ['meta-page', record.page]
    ];
    for (const [elementId, value] of textFields) {
        document.getElementById(elementId).textContent = value;
    }
    document.getElementById('meta-link').href = record.loc_record_url;

    // Start both loads before waiting so they run in parallel.
    const pdfTask = loadPdf(record.pdf_path);
    const ocrTask = loadOcrText(record);
    await Promise.all([pdfTask, ocrTask]);
}
487
+
488
/**
 * Loads the PDF at `pdfPath` with PDF.js, renders it at 100% zoom, and
 * enables the zoom and pan controls. On failure an error message is shown in
 * the PDF panel and the controls stay disabled.
 *
 * Overlapping calls are safe: only the most recent call is allowed to update
 * the panel and shared state; results of superseded calls are discarded.
 *
 * @param {string} pdfPath - URL or relative path of the PDF to display.
 * @returns {Promise<void>} Never rejects; errors are logged and displayed.
 */
async function loadPdf(pdfPath) {
    // Monotonic ticket kept on the function itself so a slow, older load can
    // detect that a newer one has started.
    const requestId = (loadPdf.latestRequestId = (loadPdf.latestRequestId || 0) + 1);
    const isStale = () => requestId !== loadPdf.latestRequestId;

    // Frees a PDF.js document's worker-side resources; failures are harmless.
    const release = (doc) => {
        if (doc) {
            Promise.resolve(doc.destroy()).catch(() => {});
        }
    };

    // Detach the previous document up front so the zoom buttons cannot
    // re-render it over the spinner or over a later error message.
    const previousPdf = state.currentPdf;
    state.currentPdf = null;
    zoomInBtn.disabled = true;
    zoomOutBtn.disabled = true;
    zoomResetBtn.disabled = true;
    pdfContainer.classList.remove('grabbable');

    pdfContainer.innerHTML = '<div class="loading"><div class="loading-spinner"></div><div>Loading PDF...</div></div>';

    try {
        const loadingTask = pdfjsLib.getDocument(pdfPath);
        const pdf = await loadingTask.promise;
        if (isStale()) {
            release(pdf);
            return;
        }
        state.currentPdf = pdf;
        state.zoomLevel = 1.0;

        await renderPdf();
        if (isStale()) {
            return;
        }

        // Enable zoom controls
        zoomInBtn.disabled = false;
        zoomOutBtn.disabled = false;
        zoomResetBtn.disabled = false;

        // Enable pan controls
        pdfContainer.classList.add('grabbable');
    } catch (error) {
        if (isStale()) {
            return;
        }
        console.error('PDF loading error:', error);
        state.currentPdf = null;

        // Build the message with textContent so text from the error (which
        // may echo the requested path) is never parsed as HTML.
        const errorBox = document.createElement('div');
        errorBox.className = 'error-message';
        errorBox.textContent = 'Failed to load PDF: ' + error.message;
        pdfContainer.innerHTML = '';
        pdfContainer.appendChild(errorBox);
    } finally {
        release(previousPdf);
    }
}
511
+
512
/**
 * Renders page 1 of `state.currentPdf` onto a fresh canvas at
 * 1.5 × `state.zoomLevel` and swaps it into the PDF panel, then updates the
 * zoom percentage label. Does nothing when no PDF is loaded.
 *
 * If the current PDF or zoom level changes while the page is being rendered
 * (for example, several quick zoom clicks), this call's result is discarded so
 * an outdated canvas can never replace a newer one.
 *
 * @returns {Promise<void>} Rejects if PDF.js fails to fetch or render the page.
 */
async function renderPdf() {
    const pdf = state.currentPdf;
    if (!pdf) return;

    // Snapshot what this call is rendering so staleness can be detected later.
    const requestedZoom = state.zoomLevel;

    pdfContainer.innerHTML = '<div class="loading"><div class="loading-spinner"></div><div>Rendering PDF...</div></div>';

    const page = await pdf.getPage(1);
    const baseScale = 1.5; // canvas scale used for the "100%" zoom level
    const scale = baseScale * requestedZoom;
    const viewport = page.getViewport({ scale });

    const canvas = document.createElement('canvas');
    canvas.id = 'pdf-canvas';
    const context = canvas.getContext('2d');
    canvas.height = viewport.height;
    canvas.width = viewport.width;

    const renderContext = {
        canvasContext: context,
        viewport: viewport
    };

    await page.render(renderContext).promise;

    // A newer load or zoom superseded this render; that call owns the panel.
    if (state.currentPdf !== pdf || state.zoomLevel !== requestedZoom) {
        return;
    }

    pdfContainer.innerHTML = '';
    pdfContainer.appendChild(canvas);

    // Update zoom level display
    zoomLevel.textContent = Math.round(requestedZoom * 100) + '%';
}
541
+
542
/**
 * Loads the OCR text for `record` from the source selected in
 * `state.currentOcrSource` and places it in the textarea. While loading, the
 * textarea and export button are disabled and a spinner overlay is shown; on
 * failure the overlay shows an error for three seconds.
 *
 * Overlapping calls are safe: only the most recent call updates the textarea
 * and state, so a slow earlier response cannot overwrite a newer one.
 *
 * @param {Object} record - Entry from data.json (reads textract_ocr_file or
 *     loc_altoxml_path depending on the selected source).
 * @returns {Promise<void>} Never rejects; errors are logged and displayed.
 */
async function loadOcrText(record) {
    // Monotonic ticket kept on the function itself to detect superseded calls.
    const requestId = (loadOcrText.latestRequestId = (loadOcrText.latestRequestId || 0) + 1);
    const isStale = () => requestId !== loadOcrText.latestRequestId;

    ocrText.value = '';
    ocrText.disabled = true;
    exportBtn.disabled = true;
    state.hasEdits = false;
    state.originalOcrText = ''; // keep the edit baseline in sync with the cleared textarea

    // Drop overlays left by earlier calls so spinners and messages never stack.
    const panel = ocrText.parentElement;
    panel.querySelectorAll('.loading').forEach((overlay) => overlay.remove());

    const loadingDiv = document.createElement('div');
    loadingDiv.className = 'loading';
    loadingDiv.innerHTML = '<div class="loading-spinner"></div><div>Loading OCR text...</div>';
    panel.appendChild(loadingDiv);

    try {
        let text = '';

        if (state.currentOcrSource === 'textract') {
            text = await fetchTextractOcr(record.textract_ocr_file);
        } else {
            text = await fetchLocOcr(record.loc_altoxml_path);
        }

        if (isStale()) {
            return; // a newer call already removed this overlay and owns the panel
        }

        state.originalOcrText = text;
        ocrText.value = text;
        ocrText.disabled = false;
        exportBtn.disabled = false;
        loadingDiv.remove();
    } catch (error) {
        if (isStale()) {
            return;
        }
        console.error('OCR loading error:', error);

        // Use textContent so the error text is never parsed as HTML.
        const errorBox = document.createElement('div');
        errorBox.className = 'error-message';
        errorBox.textContent = 'Failed to load OCR text: ' + error.message;
        loadingDiv.innerHTML = '';
        loadingDiv.appendChild(errorBox);
        setTimeout(() => loadingDiv.remove(), 3000);
    }
}
573
+
574
/**
 * Downloads a plain-text OCR file.
 *
 * @param {string} filePath - URL or relative path of the text file.
 * @returns {Promise<string>} The response body as text.
 * @throws {Error} When the HTTP response status is not OK.
 */
async function fetchTextractOcr(filePath) {
    const res = await fetch(filePath);
    if (res.ok) {
        return res.text();
    }
    throw new Error(`HTTP error! status: ${res.status}`);
}
581
+
582
/**
 * Downloads an ALTO XML file and converts it to plain text via parseAltoXml.
 *
 * @param {string} xmlPath - URL or relative path of the ALTO XML file.
 * @returns {Promise<string>} The text extracted from the XML.
 * @throws {Error} When the HTTP response status is not OK, or when
 *     parseAltoXml rejects the document.
 */
async function fetchLocOcr(xmlPath) {
    const res = await fetch(xmlPath);
    if (res.ok) {
        const altoSource = await res.text();
        return parseAltoXml(altoSource);
    }
    throw new Error(`HTTP error! status: ${res.status}`);
}
590
+
591
/**
 * Extracts plain text from an ALTO XML document.
 *
 * Words come from the CONTENT attribute of every <String> element, in
 * document order, joined by single spaces. Line breaks follow the document's
 * own <TextLine> structure: a new output line starts whenever a word belongs
 * to a different <TextLine> than the previous word. For words that are not
 * inside a <TextLine>, a new line starts when VPOS differs from the previous
 * word's VPOS by more than `lineGapThreshold`.
 *
 * @param {string} xmlText - Serialized ALTO XML.
 * @param {number} [lineGapThreshold=50] - VPOS difference (in the document's
 *     measurement unit) treated as a line break in the fallback heuristic.
 * @returns {string} Extracted text, one output line per detected text line.
 * @throws {Error} 'XML parsing error' when the input is not well-formed XML.
 */
function parseAltoXml(xmlText, lineGapThreshold = 50) {
    const parser = new DOMParser();
    const xmlDoc = parser.parseFromString(xmlText, 'text/xml');

    // DOMParser reports malformed XML through a <parsererror> element rather
    // than by throwing.
    const parserError = xmlDoc.querySelector('parsererror');
    if (parserError) {
        throw new Error('XML parsing error');
    }

    // Match on local name in any namespace so both default-namespace and
    // prefixed documents (e.g. <alto:String>) are handled.
    const strings = xmlDoc.getElementsByTagNameNS('*', 'String');
    const isTextLine = (node) => Boolean(node) && node.localName === 'TextLine';

    const textLines = [];
    let currentLine = [];
    let lastParent = null;
    let lastVPos = null;

    for (const stringElement of strings) {
        const content = stringElement.getAttribute('CONTENT');
        if (!content) {
            continue; // nothing to emit for missing or empty CONTENT
        }

        const parent = stringElement.parentNode;
        const vpos = stringElement.getAttribute('VPOS');

        if (currentLine.length > 0) {
            let startsNewLine = false;

            if (isTextLine(parent) || isTextLine(lastParent)) {
                // Structural case: the document says which words share a line.
                startsNewLine = parent !== lastParent;
            } else if (lastVPos !== null && vpos !== null) {
                // Fallback: VPOS may be fractional, so parse as float rather
                // than truncating with parseInt.
                const vposDiff = Math.abs(parseFloat(vpos) - parseFloat(lastVPos));
                startsNewLine = vposDiff > lineGapThreshold;
            }

            if (startsNewLine) {
                textLines.push(currentLine.join(' '));
                currentLine = [];
            }
        }

        currentLine.push(content);
        lastParent = parent;
        lastVPos = vpos;
    }

    // Add the last line
    if (currentLine.length > 0) {
        textLines.push(currentLine.join(' '));
    }

    return textLines.join('\n');
}
635
+
636
/**
 * Downloads the current textarea content as a UTF-8 .txt file named
 * `<id>_<sanitized title>_<publication date>.txt`. Does nothing when no
 * record is loaded.
 */
function exportText() {
    if (!state.currentRecord) return;

    const text = ocrText.value;
    const record = state.currentRecord;

    // Sanitize filename parts: collapse anything that is not a letter or
    // digit in the title, and anything that is not a letter, digit or hyphen
    // in the date, so no path separators or odd characters reach the filename.
    const sanitizedTitle = String(record.newspaper_title)
        .replace(/[^a-z0-9]/gi, '_')
        .replace(/_+/g, '_')
        .toLowerCase();
    const sanitizedDate = String(record.publication_date).replace(/[^a-z0-9-]/gi, '_');

    const filename = `${record.id}_${sanitizedTitle}_${sanitizedDate}.txt`;

    // Declare the charset so non-ASCII characters in the OCR text are
    // interpreted correctly by whatever opens the file.
    const blob = new Blob([text], { type: 'text/plain;charset=utf-8' });
    const url = URL.createObjectURL(blob);

    // Trigger the download through a temporary, detached-after-use link.
    const a = document.createElement('a');
    a.href = url;
    a.download = filename;
    document.body.appendChild(a);
    a.click();
    document.body.removeChild(a);

    // Revoke on a later task: revoking in the same tick as click() can cancel
    // the download in some browsers before it has started reading the blob.
    setTimeout(() => URL.revokeObjectURL(url), 1000);
}
661
+
662
/**
 * Changes the PDF zoom by `delta` (clamped to the range 0.5–5), re-renders
 * the page, and restores the scroll position proportionally. Does nothing
 * when no PDF is loaded or when the zoom is already at the requested limit.
 *
 * @param {number} delta - Amount to add to `state.zoomLevel` (e.g. 0.25).
 */
function zoomPdf(delta) {
    if (!state.currentPdf) return;

    // Update zoom level (limit between 0.5x and 5x); skip the expensive
    // re-render when clamping leaves the level unchanged.
    const nextZoom = Math.max(0.5, Math.min(5, state.zoomLevel + delta));
    if (nextZoom === state.zoomLevel) return;

    // Scroll offset as a fraction of the scrollable range. When the content
    // fits and nothing is scrollable, the range is 0; use 0 instead of 0/0.
    const scrollFraction = (offset, total, visible) => {
        const range = total - visible;
        return range > 0 ? offset / range : 0;
    };
    const scrollLeftPercent = scrollFraction(pdfContainer.scrollLeft, pdfContainer.scrollWidth, pdfContainer.clientWidth);
    const scrollTopPercent = scrollFraction(pdfContainer.scrollTop, pdfContainer.scrollHeight, pdfContainer.clientHeight);

    state.zoomLevel = nextZoom;

    // Re-render PDF, then restore the scroll position against the new size.
    renderPdf()
        .then(() => {
            pdfContainer.scrollLeft = scrollLeftPercent * (pdfContainer.scrollWidth - pdfContainer.clientWidth);
            pdfContainer.scrollTop = scrollTopPercent * (pdfContainer.scrollHeight - pdfContainer.clientHeight);
        })
        .catch((error) => {
            console.error('PDF zoom error:', error);
        });
}
677
+
678
/** Resets the PDF zoom to 100% and re-renders the page. */
function resetZoom() {
    state.zoomLevel = 1.0;
    // renderPdf is async; log failures instead of leaving an unhandled rejection.
    renderPdf().catch((error) => {
        console.error('PDF zoom reset error:', error);
    });
}
682
+
683
/**
 * Enables drag-to-pan on the PDF panel: pressing the primary mouse button on
 * the panel or the canvas and dragging scrolls the panel. The drag ends on
 * mouse release or when the pointer leaves the panel.
 */
function setupPanControls() {
    let isMouseDown = false;
    let startX, startY, scrollLeft, scrollTop;

    // Shared by mouseup and mouseleave: stop dragging and restore the cursor.
    const endPan = () => {
        if (!isMouseDown) return;
        isMouseDown = false;
        pdfContainer.classList.remove('grabbing');
        pdfContainer.classList.add('grabbable');
    };

    pdfContainer.addEventListener('mousedown', (e) => {
        // Primary button only, so right-click (context menu) and middle-click
        // (browser autoscroll) keep their normal behaviour.
        if (e.button !== 0) return;

        // Only pan if we clicked on the container or canvas
        if (e.target === pdfContainer || e.target.id === 'pdf-canvas') {
            isMouseDown = true;
            pdfContainer.classList.remove('grabbable');
            pdfContainer.classList.add('grabbing');

            // Remember where the drag started and how far the panel was scrolled.
            startX = e.pageX - pdfContainer.offsetLeft;
            startY = e.pageY - pdfContainer.offsetTop;
            scrollLeft = pdfContainer.scrollLeft;
            scrollTop = pdfContainer.scrollTop;
        }
    });

    pdfContainer.addEventListener('mouseleave', endPan);
    pdfContainer.addEventListener('mouseup', endPan);

    pdfContainer.addEventListener('mousemove', (e) => {
        if (!isMouseDown) return;
        e.preventDefault(); // avoid text/image selection while dragging

        const x = e.pageX - pdfContainer.offsetLeft;
        const y = e.pageY - pdfContainer.offsetTop;
        const walkX = (x - startX) * 1.5; // Multiply for faster scrolling
        const walkY = (y - startY) * 1.5;

        // Content moves with the pointer, so scroll opposite to the drag.
        pdfContainer.scrollLeft = scrollLeft - walkX;
        pdfContainer.scrollTop = scrollTop - walkY;
    });
}
730
+
731
+ // Initialize on page load
732
+ init();
733
+ </script>
734
+ </body>
735
+ </html>
loc_ocr/00001.xml ADDED
The diff for this file is too large to render. See raw diff
 
pdfs/00001.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73edc82b6c773bef7b9759de0b96c4d7f7283534b19ea8979d38bf727ccbf2e7
3
+ size 755765
pdfs/00547.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a48221cfc415b70f74d071dfa18988483295099c0d420dd03a6379b969331e39
3
+ size 199392
pdfs/00923.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7146759a27b60f0b297b23c744da68ce6075031ecba2df6a7995345388ac19f
3
+ size 237203
pdfs/01063.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfb9199879f777ab5e87c66efe4ce07e31573c91905987181749a5db0d3100d3
3
+ size 261664
pdfs/01405.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc7b7857a593c7a9817181c9c5344334bd6b99539468c2736f0d8c30bf52eb90
3
+ size 305304
pdfs/01608.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6cd86a0341dcc61e809a8ff6589dab8555ce70d3de59e0727c8aed2656d92ab3
3
+ size 311837
pdfs/01689.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be30e51aa33b8cec19d02d836a144f45b4574dc05e962328fa6ea02d9f205799
3
+ size 320123
pdfs/01718.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68770ee45704f7e14c110b6122b75be9b126a7d162b03ffeb771d4a5ad909149
3
+ size 308300
pdfs/01786.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5542b99dff7466796e4f1303f4618c5e72dfcc6402a7d918e049b97f1a66a648
3
+ size 305909
pdfs/02440.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85324033a843db80febe42ee429e2e5bd10f274f14fd918787d8cd17cc9fc5d5
3
+ size 298978
pdfs/02999.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:181e6a25941c5ca517078629900362d7cd687f15129878ad126a0bf6cdb37b80
3
+ size 442692
pdfs/03141.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de48d073e276fb3029de3b94b5bcf9301d97fe999d5085d1ce27fa7ca50cd0cb
3
+ size 1131187
pdfs/03528.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3000ce0b153b21b00f24e756074dd371cb9b70f4f89851537efb7f11ffbbc927
3
+ size 1079706
pdfs/04184.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:837918fddb438c6b9929b43dd2908bea638546e6456f518bc798983e9447ba0d
3
+ size 1717251
pdfs/05082.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b76abd62e3c6aec2137cca4980e66dab3e1a04f0552e236cb40cc3aff923e4e5
3
+ size 1290013
pdfs/06035.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21cb9983aa6e1831af1c7610e7a2466392ad0adec6cf79e53ea35838d1d1e4f9
3
+ size 2949080
pdfs/06145.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc4a2a5cf0434b18cad2894755acd21be598f4a27924f74ad9deb4f437eba960
3
+ size 3149548
pdfs/07555.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a57653f087e3c8092d77f89ee5a3c5d1aaca2f73bda4a341a7182b0f10f2fe03
3
+ size 1874688
pdfs/07787.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0bfd5400294c8c8c4a4c8b1ea8e11547d91add09866260e5f0e2fa449e97646
3
+ size 1667089
pdfs/09426.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f01af6d3529c7d465bac50bc6f615a816166c26702fd140df3c5109cd2c37800
3
+ size 3234593
pdfs/10261.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1af303780c427a43706db79f795dc3e9f7408a8a339c191237f4dfb57adf4248
3
+ size 4371190
pdfs/10875.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1df1abc03b86b980d4e600e182ebf05c25f38a39cd4ff8c25c899c5130d8f6f
3
+ size 2062854
pdfs/11901.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9cade9ddeb44a9f158b3d39a95f9ce8e21bd077fd0ba341d39faa41c92290bce
3
+ size 2083303
pdfs/12099.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0186a74e10e3d7229ec6572d4410ad51efa76f7fd1801ac1b31de076b1233e0c
3
+ size 1965570
pdfs/12747.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:356cde18ff536b5de1290752ac2d897e2ab3d44a0e3c9d56303965d6cbf5a2b7
3
+ size 5162920
pdfs/12827.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eec4dcb7c204ef52bbeb7fe281f1f772b6377e624357adaa27d4adbccfe0e5ca
3
+ size 4786829
pdfs/12964.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6b75a66ab81568055e410cd1e15834e1e2b5c706865f15d0c9a89baeea180c8
3
+ size 5313714
pdfs/13089.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e58e9b3638991ae5be837fe944637c6dbbe276f899d6393d3f9250648b61c30
3
+ size 5218234
pdfs/14825.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aba674ffa5124bece8b86d4f145c588d7b6c4b24aff8f65a191cf3e438aa61d6
3
+ size 5778052
pdfs/15210.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d521c7da2bbf7b1c867b6652d77545e879e7cadf68a76ee1acc2df395546136a
3
+ size 5727644
pdfs/16124.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:880569b7b5b414dc2a595782cffcb3751f372a10dbb320cdea52e67a6f4ba2da
3
+ size 2514563
pdfs/16999.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08a37f3a27981236348de0eb43350d93fa00876c20204e9d846c7a5daeece8e8
3
+ size 3584503
pdfs/17628.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8501e04367d8387c74122647b8cfb1c16649b15303870147d0fd5bf0ecc2e227
3
+ size 3250142
pdfs/17885.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9b8bf4722711d4d0a8cc077284f28c6eb802a4881cab7e5cadd944556d8e9a1
3
+ size 3718836
pdfs/19652.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c6d0978c509df5a101c0fa6ead2be5ec9fcc0db49f51e35469fbb3fa7e82a57
3
+ size 3319886
pdfs/19818.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f86fa4af82dcd3e98bb98f97ec0155fdf0e37240b53a51c70ab1d559d8c7fc8d
3
+ size 2759018
pdfs/19915.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83a29266dda5027cdda85c2f978fd3f3c31a54307245c59391d5f78ef30f1b90
3
+ size 2251538
pdfs/22785.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7bd027e392232763f19f3753d8231c10e27b200658a5e0f76721370693a12667
3
+ size 2026182
pdfs/22939.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:740238d83b2c6bf71514d218f9d363bf82112e13ec86c4e3066111f8db11b067
3
+ size 1969624
pdfs/23092.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe6a3f6d7680439df7f894b7aaa1a8b07fbbd92ff939bfef343edac8638cbd66
3
+ size 1977563
pdfs/24047.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a0ebe110729e8c98cc937d2bf2243b006f9dfac37e8113a30f4f67626ce7f3b
3
+ size 1952343
pdfs/25152.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a61e85f7ca2324fcd28a7d1e21c488a955bc898c6742ac068aa981d4d67544e
3
+ size 4972673
pdfs/25532.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc28f3cafda7483f230bc2801569480fca4fe582ac6309a554194aef2f5f1af2
3
+ size 4092882
pdfs/25615.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbaf54f85663b010f612e8e3e0e29ef20c15c3d12c89922433742952ce242359
3
+ size 3597146
pdfs/26136.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d7c095806cbb055e1bbcc4ac5c1cc951a2d870ab96545990a086ccacdec80e5
3
+ size 2082937
pdfs/28406.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e0dbd447a35377e365d3ea6f96ac8a189c159c512675015217865712856ecbe
3
+ size 2233416