|
|
<!DOCTYPE html> |
|
|
<html lang="en"> |
|
|
<head> |
|
|
<meta charset="UTF-8"> |
|
|
<meta name="viewport" content="width=device-width, initial-scale=1.0"> |
|
|
<title>Historical Newspaper OCR Viewer</title> |
|
|
|
|
|
|
|
|
<script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.min.js"></script> |
|
|
|
|
|
<style> |
|
|
* { |
|
|
margin: 0; |
|
|
padding: 0; |
|
|
box-sizing: border-box; |
|
|
} |
|
|
|
|
|
body { |
|
|
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif; |
|
|
height: 100vh; |
|
|
overflow: hidden; |
|
|
background-color: #f5f5f5; |
|
|
} |
|
|
|
|
|
.header { |
|
|
background-color: #2c3e50; |
|
|
color: white; |
|
|
padding: 15px 20px; |
|
|
box-shadow: 0 2px 4px rgba(0,0,0,0.1); |
|
|
} |
|
|
|
|
|
.header h1 { |
|
|
font-size: 24px; |
|
|
margin-bottom: 10px; |
|
|
} |
|
|
|
|
|
.controls { |
|
|
display: flex; |
|
|
gap: 15px; |
|
|
align-items: center; |
|
|
flex-wrap: wrap; |
|
|
} |
|
|
|
|
|
.select-container { |
|
|
flex: 1; |
|
|
max-width: 800px; |
|
|
} |
|
|
|
|
|
.select-container select { |
|
|
width: 100%; |
|
|
padding: 8px 12px; |
|
|
border: 1px solid #ddd; |
|
|
border-radius: 4px; |
|
|
font-size: 14px; |
|
|
background-color: white; |
|
|
} |
|
|
|
|
|
.metadata { |
|
|
background-color: #ecf0f1; |
|
|
padding: 15px 20px; |
|
|
border-bottom: 1px solid #bdc3c7; |
|
|
display: none; |
|
|
} |
|
|
|
|
|
.metadata.active { |
|
|
display: block; |
|
|
} |
|
|
|
|
|
.metadata-content { |
|
|
display: flex; |
|
|
gap: 20px; |
|
|
align-items: center; |
|
|
flex-wrap: wrap; |
|
|
} |
|
|
|
|
|
.metadata-item { |
|
|
font-size: 14px; |
|
|
} |
|
|
|
|
|
.metadata-item strong { |
|
|
color: #2c3e50; |
|
|
} |
|
|
|
|
|
.metadata-item a { |
|
|
color: #3498db; |
|
|
text-decoration: none; |
|
|
} |
|
|
|
|
|
.metadata-item a:hover { |
|
|
text-decoration: underline; |
|
|
} |
|
|
|
|
|
.main-container { |
|
|
display: flex; |
|
|
height: calc(100vh - 180px); |
|
|
background-color: white; |
|
|
} |
|
|
|
|
|
.pdf-panel, .text-panel { |
|
|
width: 50%; |
|
|
display: flex; |
|
|
flex-direction: column; |
|
|
border-right: 1px solid #ddd; |
|
|
} |
|
|
|
|
|
.text-panel { |
|
|
border-right: none; |
|
|
} |
|
|
|
|
|
.panel-header { |
|
|
background-color: #34495e; |
|
|
color: white; |
|
|
padding: 12px 20px; |
|
|
font-weight: 600; |
|
|
font-size: 14px; |
|
|
display: flex; |
|
|
justify-content: space-between; |
|
|
align-items: center; |
|
|
} |
|
|
|
|
|
|
|
|
.search-toolbar { |
|
|
background-color: #ecf0f1; |
|
|
padding: 8px 15px; |
|
|
display: flex; |
|
|
gap: 8px; |
|
|
align-items: center; |
|
|
border-bottom: 1px solid #bdc3c7; |
|
|
} |
|
|
|
|
|
.search-toolbar input[type="text"] { |
|
|
padding: 4px 8px; |
|
|
border: 1px solid #bdc3c7; |
|
|
border-radius: 3px; |
|
|
font-size: 13px; |
|
|
width: 140px; |
|
|
} |
|
|
|
|
|
.search-toolbar button { |
|
|
padding: 4px 10px; |
|
|
background-color: #fff; |
|
|
border: 1px solid #bdc3c7; |
|
|
border-radius: 3px; |
|
|
cursor: pointer; |
|
|
font-size: 12px; |
|
|
color: #2c3e50; |
|
|
transition: all 0.2s; |
|
|
} |
|
|
|
|
|
.search-toolbar button:hover:not(:disabled) { |
|
|
background-color: #e8e8e8; |
|
|
border-color: #95a5a6; |
|
|
} |
|
|
|
|
|
.search-toolbar button:disabled { |
|
|
opacity: 0.5; |
|
|
cursor: not-allowed; |
|
|
} |
|
|
|
|
|
.search-toolbar label { |
|
|
display: flex; |
|
|
align-items: center; |
|
|
gap: 4px; |
|
|
font-size: 12px; |
|
|
color: #2c3e50; |
|
|
cursor: pointer; |
|
|
user-select: none; |
|
|
} |
|
|
|
|
|
.search-toolbar input[type="checkbox"] { |
|
|
cursor: pointer; |
|
|
} |
|
|
|
|
|
.search-msg { |
|
|
font-size: 12px; |
|
|
color: #7f8c8d; |
|
|
margin-left: auto; |
|
|
} |
|
|
|
|
|
.pdf-controls { |
|
|
display: flex; |
|
|
gap: 10px; |
|
|
align-items: center; |
|
|
} |
|
|
|
|
|
.zoom-btn { |
|
|
background-color: #2c3e50; |
|
|
color: white; |
|
|
border: none; |
|
|
padding: 5px 12px; |
|
|
border-radius: 4px; |
|
|
cursor: pointer; |
|
|
font-size: 16px; |
|
|
font-weight: bold; |
|
|
transition: background-color 0.2s; |
|
|
min-width: 32px; |
|
|
} |
|
|
|
|
|
.zoom-btn:hover:not(:disabled) { |
|
|
background-color: #1a252f; |
|
|
} |
|
|
|
|
|
.zoom-btn:disabled { |
|
|
background-color: #95a5a6; |
|
|
cursor: not-allowed; |
|
|
opacity: 0.5; |
|
|
} |
|
|
|
|
|
.zoom-level { |
|
|
font-size: 13px; |
|
|
min-width: 50px; |
|
|
text-align: center; |
|
|
} |
|
|
|
|
|
.ocr-controls { |
|
|
display: flex; |
|
|
gap: 15px; |
|
|
align-items: center; |
|
|
} |
|
|
|
|
|
.ocr-toggle { |
|
|
display: flex; |
|
|
gap: 10px; |
|
|
background-color: #2c3e50; |
|
|
padding: 5px; |
|
|
border-radius: 4px; |
|
|
} |
|
|
|
|
|
.ocr-toggle label { |
|
|
padding: 5px 12px; |
|
|
cursor: pointer; |
|
|
border-radius: 3px; |
|
|
font-size: 13px; |
|
|
transition: background-color 0.2s; |
|
|
} |
|
|
|
|
|
/*
 * Hide the native radio buttons visually while keeping them focusable and
 * exposed to assistive technology. `display: none` would remove them from the
 * tab order, making the OCR source toggle unreachable by keyboard.
 */
.ocr-toggle input[type="radio"] {
    position: absolute;
    width: 1px;
    height: 1px;
    margin: -1px;
    overflow: hidden;
    clip-path: inset(50%);
    white-space: nowrap;
    opacity: 0;
}

/* The adjacent <span> acts as the visible "pill" for the selected option. */
.ocr-toggle input[type="radio"]:checked + span {
    background-color: #3498db;
}

/* Visible keyboard focus indicator, drawn on the pill because the input itself is hidden. */
.ocr-toggle input[type="radio"]:focus-visible + span {
    outline: 2px solid #ffffff;
    outline-offset: 2px;
}
|
|
|
|
|
.ocr-toggle span { |
|
|
padding: 5px 12px; |
|
|
border-radius: 3px; |
|
|
font-size: 13px; |
|
|
} |
|
|
|
|
|
.export-btn { |
|
|
background-color: #27ae60; |
|
|
color: white; |
|
|
border: none; |
|
|
padding: 6px 16px; |
|
|
border-radius: 4px; |
|
|
cursor: pointer; |
|
|
font-size: 13px; |
|
|
font-weight: 600; |
|
|
transition: background-color 0.2s; |
|
|
} |
|
|
|
|
|
.export-btn:hover { |
|
|
background-color: #229954; |
|
|
} |
|
|
|
|
|
.export-btn:disabled { |
|
|
background-color: #95a5a6; |
|
|
cursor: not-allowed; |
|
|
} |
|
|
|
|
|
.panel-content { |
|
|
flex: 1; |
|
|
overflow: auto; |
|
|
position: relative; |
|
|
} |
|
|
|
|
|
.panel-content.grabbable { |
|
|
cursor: grab; |
|
|
} |
|
|
|
|
|
.panel-content.grabbing { |
|
|
cursor: grabbing; |
|
|
} |
|
|
|
|
|
#pdf-canvas { |
|
|
display: block; |
|
|
margin: 20px auto; |
|
|
box-shadow: 0 2px 8px rgba(0,0,0,0.1); |
|
|
} |
|
|
|
|
|
/* Editable OCR text area; fills its panel and uses a monospace face for easy comparison. */
#ocr-text {
    width: 100%;
    height: 100%;
    border: none;
    padding: 20px;
    font-family: 'Courier New', monospace;
    font-size: 14px;
    line-height: 1.6;
    resize: none;
    outline: none;
}

/* Replacement focus indicator: the default outline is removed above, so keyboard users need another cue. */
#ocr-text:focus-visible {
    box-shadow: inset 0 0 0 2px #3498db;
}
|
|
|
|
|
.loading { |
|
|
position: absolute; |
|
|
top: 50%; |
|
|
left: 50%; |
|
|
transform: translate(-50%, -50%); |
|
|
text-align: center; |
|
|
color: #7f8c8d; |
|
|
} |
|
|
|
|
|
.loading-spinner { |
|
|
border: 4px solid #ecf0f1; |
|
|
border-top: 4px solid #3498db; |
|
|
border-radius: 50%; |
|
|
width: 40px; |
|
|
height: 40px; |
|
|
animation: spin 1s linear infinite; |
|
|
margin: 0 auto 10px; |
|
|
} |
|
|
|
|
|
@keyframes spin { |
|
|
0% { transform: rotate(0deg); } |
|
|
100% { transform: rotate(360deg); } |
|
|
} |
|
|
|
|
|
.error-message { |
|
|
color: #e74c3c; |
|
|
padding: 20px; |
|
|
text-align: center; |
|
|
} |
|
|
|
|
|
.empty-state { |
|
|
text-align: center; |
|
|
color: #95a5a6; |
|
|
padding: 40px; |
|
|
font-size: 16px; |
|
|
} |
|
|
|
|
|
.empty-state-icon { |
|
|
font-size: 48px; |
|
|
margin-bottom: 10px; |
|
|
} |
|
|
</style> |
|
|
</head> |
|
|
<body> |
|
|
<!-- Page banner: title plus the record picker (populated by populateRecordSelect). -->
<header class="header">
    <h1>Historical Newspaper OCR Viewer</h1>
    <div class="controls">
        <div class="select-container">
            <!-- aria-label supplies the accessible name; the layout has no visible label for the picker. -->
            <select id="record-select" aria-label="Newspaper record" disabled>
                <option value="">Loading records...</option>
            </select>
        </div>
    </div>
</header>
|
|
|
|
|
<!-- Record metadata strip; hidden until loadRecord() adds the "active" class and fills the spans. -->
<div class="metadata" id="metadata">
    <div class="metadata-content">
        <div class="metadata-item">
            <strong>Title:</strong> <span id="meta-title">-</span>
        </div>
        <div class="metadata-item">
            <strong>Date:</strong> <span id="meta-date">-</span>
        </div>
        <div class="metadata-item">
            <strong>Page:</strong> <span id="meta-page">-</span>
        </div>
        <div class="metadata-item">
            <!-- href is assigned by loadRecord(); rel prevents the opened page from reaching window.opener. -->
            <a id="meta-link" target="_blank" rel="noopener noreferrer">View on Library of Congress →</a>
        </div>
    </div>
</div>
|
|
|
|
|
<!-- Two-pane workspace: rendered PDF page on the left, editable OCR text on the right. -->
<main class="main-container">
    <div class="pdf-panel">
        <div class="panel-header">
            <span>PDF Viewer</span>
            <div class="pdf-controls">
                <!-- Symbol-only buttons get an explicit accessible name; title alone is not reliably announced. -->
                <button type="button" class="zoom-btn" id="zoom-out-btn" disabled title="Zoom Out" aria-label="Zoom out">−</button>
                <span class="zoom-level" id="zoom-level" aria-live="polite">100%</span>
                <button type="button" class="zoom-btn" id="zoom-in-btn" disabled title="Zoom In" aria-label="Zoom in">+</button>
                <button type="button" class="zoom-btn" id="zoom-reset-btn" disabled title="Reset Zoom">Reset</button>
            </div>
        </div>
        <!-- Contents are replaced by loadPdf()/renderPdf() with a spinner, a canvas, or an error. -->
        <div class="panel-content" id="pdf-container">
            <div class="empty-state">
                <div class="empty-state-icon" aria-hidden="true">📄</div>
                <div>Select a record to view the PDF</div>
            </div>
        </div>
    </div>

    <div class="text-panel">
        <div class="panel-header">
            <span>OCR Text</span>
            <div class="ocr-controls">
                <div class="ocr-toggle" role="radiogroup" aria-label="OCR source">
                    <label>
                        <input type="radio" name="ocr-source" value="textract" checked>
                        <span>New OCR</span>
                    </label>
                    <label>
                        <input type="radio" name="ocr-source" value="loc">
                        <span>Original OCR</span>
                    </label>
                </div>
                <button type="button" class="export-btn" id="export-btn" disabled>Export Text</button>
            </div>
        </div>

        <!-- Find/replace toolbar; enabled by updateSearchControls() once text is loaded. -->
        <div class="search-toolbar">
            <!-- Placeholders are not labels, so each field also carries an aria-label. -->
            <input type="text" id="find-input" placeholder="Find..." aria-label="Find" disabled>
            <input type="text" id="replace-input" placeholder="Replace with..." aria-label="Replace with" disabled>
            <button type="button" id="find-next-btn" disabled title="Find Next Occurrence">Find Next</button>
            <button type="button" id="replace-btn" disabled title="Replace Current Selection">Replace</button>
            <button type="button" id="replace-all-btn" disabled title="Replace All Occurrences">Replace All</button>
            <label>
                <input type="checkbox" id="case-sensitive-cb" disabled>
                Match Case
            </label>
            <!-- role="status" makes search feedback ("Not found", "Replaced ...") audible to screen readers. -->
            <span id="search-msg" class="search-msg" role="status"></span>
        </div>

        <div class="panel-content">
            <textarea
                id="ocr-text"
                placeholder="Select a record and OCR source to view text..."
                aria-label="OCR text"
                disabled
            ></textarea>
        </div>
    </div>
</main>
|
|
|
|
|
<script> |
|
|
|
|
|
// PDF.js needs the URL of its worker script; the version matches the pdf.min.js loaded in <head>.
pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.worker.min.js';

// Mutable application state shared by every handler in this script.
const state = {
    records: [],                  // entries parsed from data.json
    currentRecord: null,          // record currently shown, or null before the first selection
    currentOcrSource: 'textract', // value of the checked "ocr-source" radio
    originalOcrText: '',          // text as loaded, used to detect edits
    hasEdits: false,              // true when the textarea differs from originalOcrText
    isLoading: false,
    currentPdf: null,             // PDF.js document for the current record
    zoomLevel: 1.0                // multiplier applied on top of the base render scale
};

// Small lookup helper so the element references below read as a table.
const byId = (id) => document.getElementById(id);

// Record picker, text pane and PDF pane.
const recordSelect = byId('record-select');
const ocrText = byId('ocr-text');
const exportBtn = byId('export-btn');
const pdfContainer = byId('pdf-container');
const metadata = byId('metadata');
const ocrRadios = document.querySelectorAll('input[name="ocr-source"]');

// Zoom controls.
const zoomInBtn = byId('zoom-in-btn');
const zoomOutBtn = byId('zoom-out-btn');
const zoomResetBtn = byId('zoom-reset-btn');
const zoomLevel = byId('zoom-level');

// Find/replace toolbar.
const findInput = byId('find-input');
const replaceInput = byId('replace-input');
const findNextBtn = byId('find-next-btn');
const replaceBtn = byId('replace-btn');
const replaceAllBtn = byId('replace-all-btn');
const caseSensitiveCb = byId('case-sensitive-cb');
const searchMsg = byId('search-msg');
|
|
|
|
|
|
|
|
/**
 * Application entry point: downloads data.json, fills the record picker,
 * unlocks it and wires up all event handlers.
 *
 * Any failure along the way is logged, reflected in the picker and reported
 * to the user with an alert; the returned promise never rejects.
 */
async function init() {
    try {
        const res = await fetch('data.json');
        if (!res.ok) throw new Error('Failed to load data.json');

        state.records = await res.json();

        // Order matters only for the user: options first, then unlock, then listeners.
        for (const step of [populateRecordSelect, enableControls, setupEventListeners]) {
            step();
        }
    } catch (err) {
        console.error('Initialization error:', err);
        recordSelect.innerHTML = '<option value="">Error loading records</option>';
        alert('Failed to load data.json. Please ensure the file exists in the same directory as index.html.');
    }
}
|
|
|
|
|
/**
 * Unlocks the record picker; called once the record list has been loaded.
 */
function enableControls() {
    const picker = recordSelect;
    picker.disabled = false;
}
|
|
|
|
|
/**
 * Rebuilds the record picker: a placeholder entry followed by one <option>
 * per entry in state.records, labelled with id, title, date and page.
 */
function populateRecordSelect() {
    recordSelect.innerHTML = '<option value="">Select a record...</option>';

    for (const rec of state.records) {
        const entry = document.createElement('option');
        entry.value = rec.id;
        // textContent (not innerHTML) keeps record fields from being parsed as markup.
        entry.textContent = `${rec.id} - ${rec.newspaper_title} ${rec.publication_date} Page ${rec.page}`;
        recordSelect.appendChild(entry);
    }
}
|
|
|
|
|
/**
 * Attaches every UI event handler: record selection, OCR source toggle,
 * edit tracking, export, zoom, find/replace and PDF panning.
 * Intended to be called once, after the record list has loaded.
 */
function setupEventListeners() {
    // Record picker: load the chosen record.
    recordSelect.addEventListener('change', (e) => {
        const recordId = e.target.value;
        if (!recordId) return;

        // Same safeguard as the OCR source toggle below: never drop edits silently.
        if (state.hasEdits) {
            const confirmed = confirm('You have unsaved edits. Switching records will discard your changes. Continue?');
            if (!confirmed) {
                // Put the picker back on the record that is still displayed.
                recordSelect.value = state.currentRecord ? String(state.currentRecord.id) : '';
                return;
            }
        }

        // <option>.value is always a string, while ids in data.json may be numbers.
        const record = state.records.find(r => String(r.id) === recordId);
        if (!record) {
            console.error('No record found for id:', recordId);
            return;
        }
        loadRecord(record);
    });

    // OCR source toggle: reload the text from the other source.
    ocrRadios.forEach(radio => {
        radio.addEventListener('change', (e) => {
            if (state.hasEdits) {
                const confirmed = confirm('You have unsaved edits. Switching OCR source will discard your changes. Continue?');
                if (!confirmed) {
                    // Re-check the radio for the source that is still displayed.
                    document.querySelector(`input[name="ocr-source"][value="${state.currentOcrSource}"]`).checked = true;
                    return;
                }
            }
            state.currentOcrSource = e.target.value;
            if (state.currentRecord) {
                loadOcrText(state.currentRecord);
            }
        });
    });

    // Track whether the textarea differs from the text as loaded.
    ocrText.addEventListener('input', () => {
        state.hasEdits = ocrText.value !== state.originalOcrText;
    });

    exportBtn.addEventListener('click', exportText);

    // Zoom controls: fixed 25% steps.
    zoomInBtn.addEventListener('click', () => zoomPdf(0.25));
    zoomOutBtn.addEventListener('click', () => zoomPdf(-0.25));
    zoomResetBtn.addEventListener('click', () => resetZoom());

    // Find/replace toolbar.
    findNextBtn.addEventListener('click', findNext);
    replaceBtn.addEventListener('click', replaceCurrent);
    replaceAllBtn.addEventListener('click', replaceAll);

    // Enter in the find box jumps to the next match. 'keydown' replaces the
    // deprecated 'keypress'; isComposing skips Enter used to commit IME input.
    findInput.addEventListener('keydown', (e) => {
        if (e.key === 'Enter' && !e.isComposing) {
            findNext();
        }
    });

    setupPanControls();
}
|
|
|
|
|
|
|
|
|
|
|
/**
 * Selects the next occurrence of the find-box text in the OCR textarea,
 * starting after the current selection and wrapping to the top when needed.
 *
 * Feedback is written to the search message: empty on a direct hit,
 * "Wrapped to top" when the match was found after wrapping, "Not found"
 * otherwise. Honours the "Match Case" checkbox.
 */
function findNext() {
    const query = findInput.value;
    if (!query) return;

    const text = ocrText.value;

    // Search the original text with an escaped literal pattern. Searching a
    // lower-cased copy is unsafe because lower-casing can change the string
    // length (e.g. "İ"), which shifts every later index. This also matches the
    // way replaceAll() decides what counts as an occurrence.
    const escapedQuery = query.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const pattern = new RegExp(escapedQuery, caseSensitiveCb.checked ? 'g' : 'gi');

    pattern.lastIndex = ocrText.selectionEnd;
    let match = pattern.exec(text);

    if (match) {
        searchMsg.textContent = "";
    } else {
        // Nothing after the cursor: retry from the beginning of the text.
        pattern.lastIndex = 0;
        match = pattern.exec(text);
        searchMsg.textContent = match ? "Wrapped to top" : "Not found";
    }

    if (!match) return;

    const nextPos = match.index;
    ocrText.focus();
    ocrText.setSelectionRange(nextPos, nextPos + match[0].length);

    // Approximate the match's vertical position from its character offset and
    // scroll so that it lands roughly in the middle of the visible area.
    const progress = nextPos / text.length;
    const scrollTarget = (ocrText.scrollHeight * progress) - (ocrText.clientHeight / 2);
    ocrText.scrollTop = scrollTarget;

    // Blur/refocus shortly afterwards; presumably this nudges the browser into
    // bringing the selection into view — NOTE(review): confirm it is still needed.
    setTimeout(() => {
        ocrText.blur();
        ocrText.focus();
    }, 10);
}
|
|
|
|
|
/**
 * Replaces the current selection with the replace-box text when the
 * selection matches the find-box text (respecting "Match Case"), then moves
 * on to the next occurrence. When the selection does not match, it only
 * moves to the next occurrence.
 */
function replaceCurrent() {
    const query = findInput.value;
    const replacement = replaceInput.value;
    if (!query) return;

    const start = ocrText.selectionStart;
    const end = ocrText.selectionEnd;
    const selectedText = ocrText.value.substring(start, end);

    const match = caseSensitiveCb.checked
        ? selectedText === query
        : selectedText.toLowerCase() === query.toLowerCase();

    if (match) {
        // 'end' leaves the caret after the inserted text so the next search continues from there.
        ocrText.setRangeText(replacement, start, end, 'end');
        // setRangeText does not fire 'input', so refresh the edit flag the same way the input listener does.
        state.hasEdits = ocrText.value !== state.originalOcrText;
    }

    // In both cases advance to the next occurrence.
    findNext();

    if (match) {
        // findNext() rewrites the status message; keep the confirmation visible
        // and append whatever findNext() reported (wrap / not found).
        const followUp = searchMsg.textContent;
        searchMsg.textContent = followUp ? `Replaced — ${followUp}` : "Replaced";
    }
}
|
|
|
|
|
/**
 * Replaces every occurrence of the find-box text in the OCR textarea with
 * the replace-box text (respecting "Match Case") and reports how many
 * occurrences were replaced.
 */
function replaceAll() {
    const query = findInput.value;
    const replacement = replaceInput.value;
    if (!query) return;

    const text = ocrText.value;

    // Escape regex metacharacters so the query is matched literally.
    const escapedQuery = query.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const regex = new RegExp(escapedQuery, caseSensitiveCb.checked ? 'g' : 'gi');

    // A replacer function inserts the replacement verbatim. Passing the string
    // directly would make String.replace interpret "$&", "$1", "$$" etc. as
    // special patterns and corrupt text such as "US$$5". It also lets the
    // occurrences be counted in the same pass.
    let matchCount = 0;
    const newText = text.replace(regex, () => {
        matchCount += 1;
        return replacement;
    });

    if (matchCount === 0) {
        searchMsg.textContent = "0 matches found";
        return;
    }

    ocrText.value = newText;
    // Assigning .value does not fire 'input'; refresh the edit flag the same way the input listener does.
    state.hasEdits = newText !== state.originalOcrText;
    searchMsg.textContent = `Replaced ${matchCount} occurrence${matchCount === 1 ? '' : 's'}`;
}
|
|
|
|
|
/**
 * Enables or disables the whole find/replace toolbar.
 * When disabling, the inputs, the "Match Case" box and the status message
 * are also cleared.
 *
 * @param {boolean} enabled - true to enable the toolbar, false to disable and reset it.
 */
function updateSearchControls(enabled) {
    const locked = !enabled;
    const controls = [findInput, replaceInput, findNextBtn, replaceBtn, replaceAllBtn, caseSensitiveCb];
    for (const control of controls) {
        control.disabled = locked;
    }

    if (enabled) return;

    // Disabled toolbar: wipe any leftover search state.
    findInput.value = '';
    replaceInput.value = '';
    caseSensitiveCb.checked = false;
    searchMsg.textContent = '';
}
|
|
|
|
|
|
|
|
|
|
|
/**
 * Makes `record` the current record: shows its metadata and loads its PDF
 * and OCR text in parallel.
 *
 * @param {Object} record - entry from state.records; reads newspaper_title,
 *   publication_date, page, loc_record_url and pdf_path.
 * @returns {Promise<void>} settles once both loads have finished.
 */
async function loadRecord(record) {
    state.currentRecord = record;
    state.hasEdits = false;

    // Reveal and fill the metadata strip.
    metadata.classList.add('active');
    const fields = [
        ['meta-title', record.newspaper_title],
        ['meta-date', record.publication_date],
        ['meta-page', record.page]
    ];
    for (const [elementId, value] of fields) {
        document.getElementById(elementId).textContent = value;
    }
    document.getElementById('meta-link').href = record.loc_record_url;

    // The two panes are independent, so fetch them concurrently.
    const pdfLoad = loadPdf(record.pdf_path);
    const textLoad = loadOcrText(record);
    await Promise.all([pdfLoad, textLoad]);
}
|
|
|
|
|
/**
 * Loads the PDF at `pdfPath` with PDF.js, renders it at 100% zoom and
 * enables the zoom and pan controls. On failure an error message is shown in
 * the PDF pane instead.
 *
 * @param {string} pdfPath - URL/path of the PDF, passed to pdfjsLib.getDocument.
 * @returns {Promise<void>} never rejects; errors are logged and displayed.
 */
async function loadPdf(pdfPath) {
    // Token identifying this request, so a slow older load cannot overwrite a newer one.
    const requestToken = {};
    state.pdfRequestToken = requestToken;
    const isStale = () => state.pdfRequestToken !== requestToken;

    // Release the previous document and lock the controls until the new one is
    // ready; otherwise a failed load would leave zoom acting on the old PDF.
    const previousPdf = state.currentPdf;
    state.currentPdf = null;
    zoomInBtn.disabled = true;
    zoomOutBtn.disabled = true;
    zoomResetBtn.disabled = true;
    pdfContainer.classList.remove('grabbable');
    if (previousPdf) {
        Promise.resolve(previousPdf.destroy()).catch((destroyError) => {
            console.warn('Failed to release previous PDF:', destroyError);
        });
    }

    pdfContainer.innerHTML = '<div class="loading"><div class="loading-spinner"></div><div>Loading PDF...</div></div>';

    try {
        const loadingTask = pdfjsLib.getDocument(pdfPath);
        const pdf = await loadingTask.promise;

        if (isStale()) {
            // A newer request took over while this one was downloading.
            Promise.resolve(pdf.destroy()).catch(() => {});
            return;
        }

        state.currentPdf = pdf;
        state.zoomLevel = 1.0;

        await renderPdf();
        if (isStale()) return;

        zoomInBtn.disabled = false;
        zoomOutBtn.disabled = false;
        zoomResetBtn.disabled = false;

        // Show the "grab" cursor now that there is something to pan.
        pdfContainer.classList.add('grabbable');
    } catch (error) {
        if (isStale()) return;
        console.error('PDF loading error:', error);

        // Build the message with textContent: error text may echo the file
        // path and must not be parsed as HTML.
        const errorBox = document.createElement('div');
        errorBox.className = 'error-message';
        errorBox.textContent = 'Failed to load PDF: ' + error.message;
        pdfContainer.innerHTML = '';
        pdfContainer.appendChild(errorBox);
    }
}
|
|
|
|
|
/**
 * Renders page 1 of state.currentPdf into a fresh <canvas id="pdf-canvas">
 * at the current zoom level, replaces the PDF pane's contents with it and
 * updates the zoom percentage label. Does nothing when no PDF is loaded.
 *
 * @returns {Promise<void>} rejects if PDF.js fails to fetch or render the page.
 */
async function renderPdf() {
    // Snapshot what this call is rendering so that overlapping calls can tell
    // whether their result is still wanted.
    const pdf = state.currentPdf;
    if (!pdf) return;
    const requestedZoom = state.zoomLevel;

    pdfContainer.innerHTML = '<div class="loading"><div class="loading-spinner"></div><div>Rendering PDF...</div></div>';

    const page = await pdf.getPage(1);
    const baseScale = 1.5; // render scale that corresponds to "100%"
    const scale = baseScale * requestedZoom;
    const viewport = page.getViewport({ scale });

    const canvas = document.createElement('canvas');
    canvas.id = 'pdf-canvas';
    const context = canvas.getContext('2d');
    canvas.height = viewport.height;
    canvas.width = viewport.width;

    await page.render({
        canvasContext: context,
        viewport: viewport
    }).promise;

    // The document or zoom changed while rendering: a newer call owns the pane,
    // so showing this canvas would put stale content under an up-to-date label.
    if (state.currentPdf !== pdf || state.zoomLevel !== requestedZoom) return;

    pdfContainer.innerHTML = '';
    pdfContainer.appendChild(canvas);

    zoomLevel.textContent = Math.round(state.zoomLevel * 100) + '%';
}
|
|
|
|
|
/**
 * Loads the OCR text of `record` from the currently selected source
 * ('textract' → record.textract_ocr_file, otherwise record.loc_altoxml_path)
 * into the textarea, showing a loading overlay meanwhile. The textarea,
 * export button and search toolbar are disabled while loading and re-enabled
 * on success.
 *
 * @param {Object} record - entry from state.records.
 * @returns {Promise<void>} never rejects; errors are logged and shown for 3 seconds.
 */
async function loadOcrText(record) {
    // Token identifying this request, so a slow older response cannot
    // overwrite the text of a newer record/source selection.
    const requestToken = {};
    state.ocrRequestToken = requestToken;
    const isStale = () => state.ocrRequestToken !== requestToken;
    const source = state.currentOcrSource;

    ocrText.value = '';
    ocrText.disabled = true;
    exportBtn.disabled = true;
    updateSearchControls(false);
    state.hasEdits = false;

    const loadingDiv = document.createElement('div');
    loadingDiv.className = 'loading';
    loadingDiv.innerHTML = '<div class="loading-spinner"></div><div>Loading OCR text...</div>';
    ocrText.parentElement.appendChild(loadingDiv);

    try {
        let text = '';
        if (source === 'textract') {
            text = await fetchTextractOcr(record.textract_ocr_file);
        } else {
            text = await fetchLocOcr(record.loc_altoxml_path);
        }

        if (isStale()) {
            // Superseded: just clean up this request's overlay.
            loadingDiv.remove();
            return;
        }

        state.originalOcrText = text;
        ocrText.value = text;
        ocrText.disabled = false;
        exportBtn.disabled = false;
        updateSearchControls(true);
        loadingDiv.remove();
    } catch (error) {
        if (isStale()) {
            loadingDiv.remove();
            return;
        }
        console.error('OCR loading error:', error);

        // Build the message with textContent so error text is never parsed as HTML.
        const errorBox = document.createElement('div');
        errorBox.className = 'error-message';
        errorBox.textContent = 'Failed to load OCR text: ' + error.message;
        loadingDiv.innerHTML = '';
        loadingDiv.appendChild(errorBox);
        setTimeout(() => loadingDiv.remove(), 3000);
    }
}
|
|
|
|
|
/**
 * Downloads the plain-text OCR file at `filePath`.
 *
 * @param {string} filePath - URL/path of the text file.
 * @returns {Promise<string>} the response body as text.
 * @throws {Error} when the response status is not OK.
 */
async function fetchTextractOcr(filePath) {
    const res = await fetch(filePath);
    if (res.ok) {
        const body = await res.text();
        return body;
    }
    throw new Error(`HTTP error! status: ${res.status}`);
}
|
|
|
|
|
/**
 * Downloads the ALTO XML file at `xmlPath` and converts it to plain text
 * with parseAltoXml().
 *
 * @param {string} xmlPath - URL/path of the ALTO XML file.
 * @returns {Promise<string>} the extracted text.
 * @throws {Error} when the response status is not OK or the XML cannot be parsed.
 */
async function fetchLocOcr(xmlPath) {
    const res = await fetch(xmlPath);
    if (res.ok) {
        const xmlSource = await res.text();
        return parseAltoXml(xmlSource);
    }
    throw new Error(`HTTP error! status: ${res.status}`);
}
|
|
|
|
|
/**
 * Extracts plain text from an ALTO XML document.
 *
 * Collects the CONTENT attribute of every <String> element in document
 * order, joining words with spaces. A new output line starts whenever the
 * VPOS of a word differs from the previous word's VPOS by more than 50.
 *
 * @param {string} xmlText - raw XML source.
 * @returns {string} the extracted lines joined with "\n".
 * @throws {Error} 'XML parsing error' when the XML is not well-formed.
 */
function parseAltoXml(xmlText) {
    const xmlDoc = new DOMParser().parseFromString(xmlText, 'text/xml');

    // DOMParser reports malformed XML through a <parsererror> node instead of throwing.
    if (xmlDoc.querySelector('parsererror')) {
        throw new Error('XML parsing error');
    }

    const lines = [];
    let words = [];
    let prevVPos = null;

    // Moves the words gathered so far into `lines` (if there are any).
    const flushLine = () => {
        if (words.length > 0) {
            lines.push(words.join(' '));
            words = [];
        }
    };

    for (const element of Array.from(xmlDoc.getElementsByTagName('String'))) {
        const content = element.getAttribute('CONTENT');
        if (!content) continue; // missing or empty CONTENT contributes nothing

        const vpos = element.getAttribute('VPOS');
        const comparable = prevVPos !== null && vpos !== null;
        if (comparable && Math.abs(parseInt(vpos) - parseInt(prevVPos)) > 50) {
            // Large vertical jump: treat it as the start of a new line.
            flushLine();
        }

        words.push(content);
        prevVPos = vpos;
    }

    // Emit whatever is left in the last line.
    flushLine();

    return lines.join('\n');
}
|
|
|
|
|
/**
 * Downloads the textarea's current contents as a .txt file named
 * "<id>_<sanitized title>_<publication date>.txt". Does nothing when no
 * record is selected.
 */
function exportText() {
    if (!state.currentRecord) return;

    const text = ocrText.value;
    const record = state.currentRecord;

    // Reduce the title to lowercase letters, digits and single underscores.
    // String() keeps a missing title from throwing.
    const sanitizedTitle = String(record.newspaper_title)
        .replace(/[^a-z0-9]/gi, '_')
        .replace(/_+/g, '_')
        .toLowerCase();

    const filename = `${record.id}_${sanitizedTitle}_${record.publication_date}.txt`;

    // Declare the charset so non-ASCII OCR text is read back correctly.
    const blob = new Blob([text], { type: 'text/plain;charset=utf-8' });
    const url = URL.createObjectURL(blob);

    // Trigger the download through a temporary, programmatically clicked link.
    const a = document.createElement('a');
    a.href = url;
    a.download = filename;
    document.body.appendChild(a);
    try {
        a.click();
    } finally {
        document.body.removeChild(a);
        // Revoke after a short delay: revoking in the same tick as click() can
        // cancel the download in some browsers before it has started.
        setTimeout(() => URL.revokeObjectURL(url), 1000);
    }
}
|
|
|
|
|
/**
 * Changes the PDF zoom level by `delta` (clamped to 50%–500%), re-renders
 * the page and restores the previous relative scroll position.
 *
 * @param {number} delta - amount to add to state.zoomLevel, e.g. 0.25 or -0.25.
 */
function zoomPdf(delta) {
    const MIN_ZOOM = 0.5;
    const MAX_ZOOM = 5;

    const nextZoom = Math.max(MIN_ZOOM, Math.min(MAX_ZOOM, state.zoomLevel + delta));
    // Already at the limit: skip the re-render (and the flash of the spinner).
    if (nextZoom === state.zoomLevel) return;

    // Scroll position as a fraction of the scrollable range; 0 when the
    // content does not overflow (avoids a 0/0 = NaN ratio).
    const fraction = (position, range) => (range > 0 ? position / range : 0);
    const scrollLeftPercent = fraction(pdfContainer.scrollLeft, pdfContainer.scrollWidth - pdfContainer.clientWidth);
    const scrollTopPercent = fraction(pdfContainer.scrollTop, pdfContainer.scrollHeight - pdfContainer.clientHeight);

    state.zoomLevel = nextZoom;

    renderPdf()
        .then(() => {
            // Re-apply the same relative position against the new canvas size.
            pdfContainer.scrollLeft = scrollLeftPercent * (pdfContainer.scrollWidth - pdfContainer.clientWidth);
            pdfContainer.scrollTop = scrollTopPercent * (pdfContainer.scrollHeight - pdfContainer.clientHeight);
        })
        .catch((error) => {
            // Without this the failure would surface only as an unhandled rejection.
            console.error('PDF zoom error:', error);
        });
}
|
|
|
|
|
/**
 * Resets the PDF zoom level to 100% and re-renders the page.
 */
function resetZoom() {
    state.zoomLevel = 1.0;
    renderPdf().catch((error) => {
        // Without this the failure would surface only as an unhandled rejection.
        console.error('PDF zoom reset error:', error);
    });
}
|
|
|
|
|
/**
 * Enables click-and-drag panning of the PDF pane: pressing the primary
 * mouse button on the pane or the canvas starts a pan, moving the mouse
 * scrolls the pane, and releasing the button or leaving the pane ends it.
 * The cursor switches between the "grabbable" and "grabbing" classes.
 */
function setupPanControls() {
    let isMouseDown = false;
    let startX, startY, scrollLeft, scrollTop;

    // Shared by mouseup and mouseleave: finish the pan and restore the idle cursor.
    const endPan = () => {
        if (!isMouseDown) return;
        isMouseDown = false;
        pdfContainer.classList.remove('grabbing');
        pdfContainer.classList.add('grabbable');
    };

    pdfContainer.addEventListener('mousedown', (e) => {
        // Primary button only, and only once a PDF is loaded. Otherwise a click
        // on the empty state would leave the "grab" cursor behind with nothing to pan.
        if (e.button !== 0 || !state.currentPdf) return;

        // Ignore presses on other children (spinner, error message).
        if (e.target !== pdfContainer && e.target.id !== 'pdf-canvas') return;

        isMouseDown = true;
        pdfContainer.classList.remove('grabbable');
        pdfContainer.classList.add('grabbing');

        // Remember where the drag started, in pane coordinates and scroll offsets.
        startX = e.pageX - pdfContainer.offsetLeft;
        startY = e.pageY - pdfContainer.offsetTop;
        scrollLeft = pdfContainer.scrollLeft;
        scrollTop = pdfContainer.scrollTop;
    });

    pdfContainer.addEventListener('mouseleave', endPan);
    pdfContainer.addEventListener('mouseup', endPan);

    pdfContainer.addEventListener('mousemove', (e) => {
        if (!isMouseDown) return;
        e.preventDefault(); // suppress text/image selection while dragging

        const x = e.pageX - pdfContainer.offsetLeft;
        const y = e.pageY - pdfContainer.offsetTop;
        // The 1.5 factor makes the page move faster than the pointer.
        const walkX = (x - startX) * 1.5;
        const walkY = (y - startY) * 1.5;

        pdfContainer.scrollLeft = scrollLeft - walkX;
        pdfContainer.scrollTop = scrollTop - walkY;
    });
}
|
|
|
|
|
|
|
|
// Start the application. init() handles its own errors, so the returned
// promise is intentionally not awaited.
void init();
|
|
</script> |
|
|
</body> |
|
|
</html> |