ocr_correction / index.html
MikeTrizna's picture
Added find and replace functionality, with case sensitivity functionality
c184c36
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Historical Newspaper OCR Viewer</title>
<!-- PDF.js Library -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.min.js"></script>
<style>
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
height: 100vh;
overflow: hidden;
background-color: #f5f5f5;
}
.header {
background-color: #2c3e50;
color: white;
padding: 15px 20px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.header h1 {
font-size: 24px;
margin-bottom: 10px;
}
.controls {
display: flex;
gap: 15px;
align-items: center;
flex-wrap: wrap;
}
.select-container {
flex: 1;
max-width: 800px;
}
.select-container select {
width: 100%;
padding: 8px 12px;
border: 1px solid #ddd;
border-radius: 4px;
font-size: 14px;
background-color: white;
}
.metadata {
background-color: #ecf0f1;
padding: 15px 20px;
border-bottom: 1px solid #bdc3c7;
display: none;
}
.metadata.active {
display: block;
}
.metadata-content {
display: flex;
gap: 20px;
align-items: center;
flex-wrap: wrap;
}
.metadata-item {
font-size: 14px;
}
.metadata-item strong {
color: #2c3e50;
}
.metadata-item a {
color: #3498db;
text-decoration: none;
}
.metadata-item a:hover {
text-decoration: underline;
}
.main-container {
display: flex;
height: calc(100vh - 180px);
background-color: white;
}
.pdf-panel, .text-panel {
width: 50%;
display: flex;
flex-direction: column;
border-right: 1px solid #ddd;
}
.text-panel {
border-right: none;
}
.panel-header {
background-color: #34495e;
color: white;
padding: 12px 20px;
font-weight: 600;
font-size: 14px;
display: flex;
justify-content: space-between;
align-items: center;
}
/* Search Toolbar Styles */
.search-toolbar {
background-color: #ecf0f1;
padding: 8px 15px;
display: flex;
gap: 8px;
align-items: center;
border-bottom: 1px solid #bdc3c7;
}
.search-toolbar input[type="text"] {
padding: 4px 8px;
border: 1px solid #bdc3c7;
border-radius: 3px;
font-size: 13px;
width: 140px;
}
.search-toolbar button {
padding: 4px 10px;
background-color: #fff;
border: 1px solid #bdc3c7;
border-radius: 3px;
cursor: pointer;
font-size: 12px;
color: #2c3e50;
transition: all 0.2s;
}
.search-toolbar button:hover:not(:disabled) {
background-color: #e8e8e8;
border-color: #95a5a6;
}
.search-toolbar button:disabled {
opacity: 0.5;
cursor: not-allowed;
}
.search-toolbar label {
display: flex;
align-items: center;
gap: 4px;
font-size: 12px;
color: #2c3e50;
cursor: pointer;
user-select: none;
}
.search-toolbar input[type="checkbox"] {
cursor: pointer;
}
.search-msg {
font-size: 12px;
color: #7f8c8d;
margin-left: auto;
}
.pdf-controls {
display: flex;
gap: 10px;
align-items: center;
}
.zoom-btn {
background-color: #2c3e50;
color: white;
border: none;
padding: 5px 12px;
border-radius: 4px;
cursor: pointer;
font-size: 16px;
font-weight: bold;
transition: background-color 0.2s;
min-width: 32px;
}
.zoom-btn:hover:not(:disabled) {
background-color: #1a252f;
}
.zoom-btn:disabled {
background-color: #95a5a6;
cursor: not-allowed;
opacity: 0.5;
}
.zoom-level {
font-size: 13px;
min-width: 50px;
text-align: center;
}
.ocr-controls {
display: flex;
gap: 15px;
align-items: center;
}
.ocr-toggle {
display: flex;
gap: 10px;
background-color: #2c3e50;
padding: 5px;
border-radius: 4px;
}
.ocr-toggle label {
padding: 5px 12px;
cursor: pointer;
border-radius: 3px;
font-size: 13px;
transition: background-color 0.2s;
}
.ocr-toggle input[type="radio"] {
display: none;
}
.ocr-toggle input[type="radio"]:checked + span {
background-color: #3498db;
}
.ocr-toggle span {
padding: 5px 12px;
border-radius: 3px;
font-size: 13px;
}
.export-btn {
background-color: #27ae60;
color: white;
border: none;
padding: 6px 16px;
border-radius: 4px;
cursor: pointer;
font-size: 13px;
font-weight: 600;
transition: background-color 0.2s;
}
.export-btn:hover {
background-color: #229954;
}
.export-btn:disabled {
background-color: #95a5a6;
cursor: not-allowed;
}
.panel-content {
flex: 1;
overflow: auto;
position: relative;
}
.panel-content.grabbable {
cursor: grab;
}
.panel-content.grabbing {
cursor: grabbing;
}
#pdf-canvas {
display: block;
margin: 20px auto;
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
}
#ocr-text {
width: 100%;
height: 100%;
border: none;
padding: 20px;
font-family: 'Courier New', monospace;
font-size: 14px;
line-height: 1.6;
resize: none;
outline: none;
}
.loading {
position: absolute;
top: 50%;
left: 50%;
transform: translate(-50%, -50%);
text-align: center;
color: #7f8c8d;
}
.loading-spinner {
border: 4px solid #ecf0f1;
border-top: 4px solid #3498db;
border-radius: 50%;
width: 40px;
height: 40px;
animation: spin 1s linear infinite;
margin: 0 auto 10px;
}
@keyframes spin {
0% { transform: rotate(0deg); }
100% { transform: rotate(360deg); }
}
.error-message {
color: #e74c3c;
padding: 20px;
text-align: center;
}
.empty-state {
text-align: center;
color: #95a5a6;
padding: 40px;
font-size: 16px;
}
.empty-state-icon {
font-size: 48px;
margin-bottom: 10px;
}
</style>
</head>
<body>
<!-- Page banner: application title plus the record selector (filled in by populateRecordSelect). -->
<header class="header">
  <h1>Historical Newspaper OCR Viewer</h1>
  <div class="controls">
    <div class="select-container">
      <!-- aria-label gives the dropdown an accessible name; it has no visible label. -->
      <select id="record-select" disabled aria-label="Newspaper record">
        <option value="">Loading records...</option>
      </select>
    </div>
  </div>
</header>
<!-- Metadata bar for the selected record; hidden until loadRecord adds the "active" class. -->
<div class="metadata" id="metadata">
  <div class="metadata-content">
    <div class="metadata-item">
      <strong>Title:</strong> <span id="meta-title">-</span>
    </div>
    <div class="metadata-item">
      <strong>Date:</strong> <span id="meta-date">-</span>
    </div>
    <div class="metadata-item">
      <strong>Page:</strong> <span id="meta-page">-</span>
    </div>
    <div class="metadata-item">
      <!-- href is a placeholder that loadRecord overwrites; rel keeps the opened tab from reaching window.opener. -->
      <a id="meta-link" href="#" target="_blank" rel="noopener noreferrer" title="Opens in a new tab">View on Library of Congress →</a>
    </div>
  </div>
</div>
<!-- Two-pane workspace: rendered PDF page on the left, editable OCR text on the right. -->
<main class="main-container">
  <div class="pdf-panel">
    <div class="panel-header">
      <span>PDF Viewer</span>
      <div class="pdf-controls">
        <!-- The zoom-out button previously had no text at all; it now mirrors the "+" button. -->
        <button type="button" class="zoom-btn" id="zoom-out-btn" disabled title="Zoom Out" aria-label="Zoom out">−</button>
        <span class="zoom-level" id="zoom-level">100%</span>
        <button type="button" class="zoom-btn" id="zoom-in-btn" disabled title="Zoom In" aria-label="Zoom in">+</button>
        <button type="button" class="zoom-btn" id="zoom-reset-btn" disabled title="Reset Zoom">Reset</button>
      </div>
    </div>
    <div class="panel-content" id="pdf-container">
      <div class="empty-state">
        <div class="empty-state-icon" aria-hidden="true">📄</div>
        <div>Select a record to view the PDF</div>
      </div>
    </div>
  </div>
  <div class="text-panel">
    <div class="panel-header">
      <span>OCR Text</span>
      <div class="ocr-controls">
        <!-- Keep the label > input + span structure: the stylesheet highlights the checked option via "input:checked + span". -->
        <div class="ocr-toggle" role="radiogroup" aria-label="OCR source">
          <label>
            <input type="radio" name="ocr-source" value="textract" checked>
            <span>New OCR</span>
          </label>
          <label>
            <input type="radio" name="ocr-source" value="loc">
            <span>Original OCR</span>
          </label>
        </div>
        <button type="button" class="export-btn" id="export-btn" disabled>Export Text</button>
      </div>
    </div>
    <!-- Find / replace toolbar; enabled by updateSearchControls once OCR text has loaded. -->
    <div class="search-toolbar">
      <!-- Placeholders are not labels, so each field also carries an aria-label. -->
      <input type="text" id="find-input" placeholder="Find..." aria-label="Find" disabled>
      <input type="text" id="replace-input" placeholder="Replace with..." aria-label="Replace with" disabled>
      <button type="button" id="find-next-btn" disabled title="Find Next Occurrence">Find Next</button>
      <button type="button" id="replace-btn" disabled title="Replace Current Selection">Replace</button>
      <button type="button" id="replace-all-btn" disabled title="Replace All Occurrences">Replace All</button>
      <label>
        <input type="checkbox" id="case-sensitive-cb" disabled>
        Match Case
      </label>
      <!-- role="status" lets assistive technology announce "Not found", "Replaced", etc. -->
      <span id="search-msg" class="search-msg" role="status"></span>
    </div>
    <div class="panel-content">
      <textarea
        id="ocr-text"
        placeholder="Select a record and OCR source to view text..."
        aria-label="OCR text"
        disabled
      ></textarea>
    </div>
  </div>
</main>
<script>
// Configure PDF.js worker
pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.worker.min.js';
// Application state: one mutable object shared by every function in this script.
const state = {
// Record objects parsed from data.json by init().
records: [],
// Record currently displayed; assigned by loadRecord(). null until a record is chosen.
currentRecord: null,
// Value of the checked "ocr-source" radio button: 'textract' or 'loc'.
currentOcrSource: 'textract',
// OCR text exactly as loaded; the textarea content is compared against it to detect edits.
originalOcrText: '',
// True when the textarea has been changed since the text was loaded.
hasEdits: false,
// NOTE(review): not read or written anywhere else in the visible script — appears unused.
isLoading: false,
// PDF.js document obtained in loadPdf(); null until a PDF has been loaded.
currentPdf: null,
// Zoom multiplier applied on top of renderPdf()'s base scale; 1.0 is displayed as "100%".
zoomLevel: 1.0
};
// DOM elements
// Looked up once while the script is evaluated; this works because the <script>
// element is placed after all of the markup it refers to.
const recordSelect = document.getElementById('record-select');
const ocrText = document.getElementById('ocr-text');
const exportBtn = document.getElementById('export-btn');
const pdfContainer = document.getElementById('pdf-container');
const metadata = document.getElementById('metadata');
// Both "ocr-source" radio inputs (values 'textract' and 'loc').
const ocrRadios = document.querySelectorAll('input[name="ocr-source"]');
const zoomInBtn = document.getElementById('zoom-in-btn');
const zoomOutBtn = document.getElementById('zoom-out-btn');
const zoomResetBtn = document.getElementById('zoom-reset-btn');
// The <span> showing the zoom percentage (not the numeric value, which lives in state.zoomLevel).
const zoomLevel = document.getElementById('zoom-level');
// Search DOM elements
const findInput = document.getElementById('find-input');
const replaceInput = document.getElementById('replace-input');
const findNextBtn = document.getElementById('find-next-btn');
const replaceBtn = document.getElementById('replace-btn');
const replaceAllBtn = document.getElementById('replace-all-btn');
const caseSensitiveCb = document.getElementById('case-sensitive-cb');
// Status text at the right edge of the search toolbar ("Not found", "Replaced", ...).
const searchMsg = document.getElementById('search-msg');
/**
 * Application entry point: downloads data.json, stores the parsed records,
 * fills and unlocks the record dropdown and registers all event handlers.
 * Any failure is logged, reflected in the dropdown and reported via alert().
 */
async function init() {
  let dataResponse;
  try {
    dataResponse = await fetch('data.json');
    if (!dataResponse.ok) throw new Error('Failed to load data.json');

    state.records = await dataResponse.json();

    populateRecordSelect();
    enableControls();
    setupEventListeners();
  } catch (err) {
    console.error('Initialization error:', err);
    recordSelect.innerHTML = '<option value="">Error loading records</option>';
    alert('Failed to load data.json. Please ensure the file exists in the same directory as index.html.');
  }
}
/** Unlocks the record dropdown once the record list is available. */
function enableControls() {
  const dropdown = recordSelect;
  dropdown.disabled = false;
}
/**
 * Rebuilds the record dropdown: a placeholder entry followed by one <option>
 * per record in state.records, labelled "<id> - <title> <date> Page <page>".
 */
function populateRecordSelect() {
  recordSelect.innerHTML = '<option value="">Select a record...</option>';

  for (const entry of state.records) {
    const label = `${entry.id} - ${entry.newspaper_title} ${entry.publication_date} Page ${entry.page}`;
    const optionEl = document.createElement('option');
    optionEl.value = entry.id;
    optionEl.textContent = label;
    recordSelect.appendChild(optionEl);
  }
}
/**
 * Registers every UI event handler: record selection, OCR source toggle,
 * edit tracking, export, zoom, find/replace and PDF panning.
 * Called once from init() after the record list has loaded.
 */
function setupEventListeners() {
  // Record selection
  recordSelect.addEventListener('change', (e) => {
    const recordId = e.target.value;
    if (!recordId) return;

    // Same safeguard as the OCR source toggle below: loading another record
    // replaces the textarea content, so ask before discarding edits.
    if (state.hasEdits) {
      const confirmed = confirm('You have unsaved edits. Switching records will discard your changes. Continue?');
      if (!confirmed) {
        // Put the dropdown back on the record that is still displayed
        recordSelect.value = state.currentRecord ? String(state.currentRecord.id) : '';
        return;
      }
    }

    // An <option> value is always a string, whereas an id parsed from JSON
    // may be a number, so compare the string forms.
    const record = state.records.find(r => String(r.id) === recordId);
    if (!record) {
      console.error('No record found for id:', recordId);
      return;
    }
    loadRecord(record);
  });

  // OCR source toggle
  ocrRadios.forEach(radio => {
    radio.addEventListener('change', (e) => {
      if (state.hasEdits) {
        const confirmed = confirm('You have unsaved edits. Switching OCR source will discard your changes. Continue?');
        if (!confirmed) {
          // Revert to previous selection
          document.querySelector(`input[name="ocr-source"][value="${state.currentOcrSource}"]`).checked = true;
          return;
        }
      }
      state.currentOcrSource = e.target.value;
      if (state.currentRecord) {
        loadOcrText(state.currentRecord);
      }
    });
  });

  // Track edits: typing the text back to its loaded form clears the flag again
  ocrText.addEventListener('input', () => {
    state.hasEdits = ocrText.value !== state.originalOcrText;
  });

  // Export button
  exportBtn.addEventListener('click', exportText);

  // Zoom controls
  zoomInBtn.addEventListener('click', () => zoomPdf(0.25));
  zoomOutBtn.addEventListener('click', () => zoomPdf(-0.25));
  zoomResetBtn.addEventListener('click', () => resetZoom());

  // Search controls
  findNextBtn.addEventListener('click', findNext);
  replaceBtn.addEventListener('click', replaceCurrent);
  replaceAllBtn.addEventListener('click', replaceAll);

  // Allow "Enter" in the find input to trigger Find Next.
  // 'keydown' replaces the deprecated 'keypress' event. The keystroke is
  // cancelled because findNext() moves focus into the textarea, and the Enter
  // must not end up being applied there (it would overwrite the selected match).
  // Enter presses that only confirm an IME composition are ignored.
  findInput.addEventListener('keydown', (e) => {
    if (e.key === 'Enter' && !e.isComposing) {
      e.preventDefault();
      findNext();
    }
  });

  // Pan functionality
  setupPanControls();
}
// --- Search Functions ---
/**
 * Selects the next occurrence of the "Find" text inside the OCR textarea.
 * The search starts at the end of the current selection and wraps around to
 * the top when nothing is found further down. The outcome is reported in the
 * toolbar message ("", "Wrapped to top" or "Not found"). Honors the
 * "Match Case" checkbox. Does nothing when the find field is empty.
 */
function findNext() {
  const needle = findInput.value;
  if (!needle) return;

  const fullText = ocrText.value;
  const matchCase = caseSensitiveCb.checked;

  // Case-insensitive mode compares lowercased copies of both strings
  const haystack = matchCase ? fullText : fullText.toLowerCase();
  const target = matchCase ? needle : needle.toLowerCase();

  // First look from the end of the current selection onwards...
  let hit = haystack.indexOf(target, ocrText.selectionEnd);
  if (hit !== -1) {
    searchMsg.textContent = "";
  } else {
    // ...then wrap around and look from the very beginning
    hit = haystack.indexOf(target, 0);
    searchMsg.textContent = hit !== -1 ? "Wrapped to top" : "Not found";
  }

  if (hit === -1) return;

  // Highlight the match
  ocrText.focus();
  ocrText.setSelectionRange(hit, hit + needle.length);

  // Approximate scroll position: the match's relative offset in the text,
  // mapped onto the scroll height and shifted up by half the visible height
  // so the match lands near the middle of the view.
  const fraction = hit / fullText.length;
  ocrText.scrollTop = (ocrText.scrollHeight * fraction) - (ocrText.clientHeight / 2);

  // Fallback for when the estimate is off (line wrapping varies): a deferred
  // blur/focus cycle makes the browser bring the selection into view itself.
  setTimeout(() => {
    ocrText.blur();
    ocrText.focus();
  }, 10);
}
/**
 * Replaces the currently selected text with the "Replace with" value when the
 * selection equals the "Find" text (respecting "Match Case"), then advances to
 * the next occurrence. When the selection does not match, it only advances to
 * the next occurrence so that a second click performs the replacement.
 * Does nothing when the find field is empty.
 */
function replaceCurrent() {
  const query = findInput.value;
  if (!query) return;

  const replacement = replaceInput.value;
  const start = ocrText.selectionStart;
  const end = ocrText.selectionEnd;
  const selectedText = ocrText.value.substring(start, end);

  const selectionMatches = caseSensitiveCb.checked
    ? selectedText === query
    : selectedText.toLowerCase() === query.toLowerCase();

  if (!selectionMatches) {
    // Nothing suitable is selected yet: just move to the next occurrence
    findNext();
    return;
  }

  // 'end' leaves the caret after the inserted text, so the following search
  // continues behind the replacement rather than inside it.
  ocrText.setRangeText(replacement, start, end, 'end');

  // setRangeText() fires no 'input' event, so refresh the edit flag here using
  // the same comparison as the textarea's 'input' handler.
  state.hasEdits = ocrText.value !== state.originalOcrText;

  // Automatically find next. findNext() writes its own status message, so the
  // confirmation is composed afterwards; otherwise it would be overwritten
  // before the user could see it.
  findNext();
  const followUp = searchMsg.textContent;
  searchMsg.textContent = followUp ? `Replaced. ${followUp}` : "Replaced";
}
/**
 * Replaces every occurrence of the "Find" text in the OCR textarea with the
 * "Replace with" text and reports the number of replacements in the toolbar.
 * Both values are treated literally; "Match Case" controls case sensitivity.
 * Does nothing when the find field is empty.
 */
function replaceAll() {
  const query = findInput.value;
  if (!query) return;

  const replacement = replaceInput.value;
  const text = ocrText.value;

  // Escape special regex characters to perform a literal "Replace All"
  const escapedQuery = query.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const flags = caseSensitiveCb.checked ? 'g' : 'gi';
  const regex = new RegExp(escapedQuery, flags);

  // A replacer function is used instead of a replacement string: in a string,
  // sequences such as "$&", "$$", "$`" and "$'" are expanded as patterns, which
  // would corrupt replacement text containing a dollar sign. The callback also
  // counts the matches, so the text is scanned only once.
  let matchCount = 0;
  const newText = text.replace(regex, () => {
    matchCount += 1;
    return replacement;
  });

  if (matchCount > 0) {
    ocrText.value = newText;
    // Assigning .value fires no 'input' event, so refresh the edit flag here
    // using the same comparison as the textarea's 'input' handler.
    state.hasEdits = ocrText.value !== state.originalOcrText;
    searchMsg.textContent = `Replaced ${matchCount} occurrences`;
  } else {
    searchMsg.textContent = "0 matches found";
  }
}
/**
 * Enables or disables the whole find/replace toolbar.
 * When disabling, the toolbar is also reset: both text fields are emptied,
 * "Match Case" is unchecked and the status message is cleared.
 *
 * @param {boolean} enabled - true to unlock the controls, false to lock and reset them.
 */
function updateSearchControls(enabled) {
  const locked = !enabled;
  const controls = [findInput, replaceInput, findNextBtn, replaceBtn, replaceAllBtn, caseSensitiveCb];
  for (const control of controls) {
    control.disabled = locked;
  }

  if (!locked) return;

  findInput.value = '';
  replaceInput.value = '';
  caseSensitiveCb.checked = false;
  searchMsg.textContent = '';
}
// --- End Search Functions ---
/**
 * Makes `record` the current record: shows its metadata in the metadata bar
 * and then loads its PDF and its OCR text concurrently.
 *
 * @param {object} record - entry from state.records; the fields read here are
 *   newspaper_title, publication_date, page, loc_record_url and pdf_path.
 * @returns {Promise<void>} settles once both loaders have finished.
 */
async function loadRecord(record) {
  state.currentRecord = record;
  state.hasEdits = false;

  // Reveal the metadata bar and fill in its fields
  metadata.classList.add('active');
  const fieldValues = [
    ['meta-title', record.newspaper_title],
    ['meta-date', record.publication_date],
    ['meta-page', record.page]
  ];
  for (const [elementId, value] of fieldValues) {
    document.getElementById(elementId).textContent = value;
  }
  document.getElementById('meta-link').href = record.loc_record_url;

  // Both loaders are started before either is awaited, so they run in parallel
  const pdfLoad = loadPdf(record.pdf_path);
  const textLoad = loadOcrText(record);
  await Promise.all([pdfLoad, textLoad]);
}
/**
 * Loads the PDF at `pdfPath` with PDF.js, renders it at 100% zoom and enables
 * the zoom and pan controls. On failure an error message is shown in the PDF
 * panel and the controls stay disabled.
 *
 * @param {string} pdfPath - URL or path handed to pdfjsLib.getDocument().
 * @returns {Promise<void>} never rejects; load errors are logged and displayed.
 */
async function loadPdf(pdfPath) {
  // Forget the previous document before starting. Otherwise a failed load would
  // leave the old record's PDF in state.currentPdf with the zoom buttons still
  // enabled, and zooming would re-render the wrong document.
  const previousPdf = state.currentPdf;
  state.currentPdf = null;
  zoomInBtn.disabled = true;
  zoomOutBtn.disabled = true;
  zoomResetBtn.disabled = true;
  pdfContainer.classList.remove('grabbable');

  // Release the resources PDF.js holds for the previous document. Cleanup
  // problems are only logged; they must not prevent the new load.
  if (previousPdf) {
    Promise.resolve()
      .then(() => previousPdf.destroy())
      .catch(cleanupError => console.warn('PDF cleanup error:', cleanupError));
  }

  pdfContainer.innerHTML = '<div class="loading"><div class="loading-spinner"></div><div>Loading PDF...</div></div>';

  try {
    const loadingTask = pdfjsLib.getDocument(pdfPath);
    const pdf = await loadingTask.promise;
    state.currentPdf = pdf;
    state.zoomLevel = 1.0;
    await renderPdf();

    // Enable zoom controls
    zoomInBtn.disabled = false;
    zoomOutBtn.disabled = false;
    zoomResetBtn.disabled = false;

    // Enable pan controls
    pdfContainer.classList.add('grabbable');
  } catch (error) {
    console.error('PDF loading error:', error);

    // Build the message with textContent so characters in error.message are
    // shown literally instead of being parsed as HTML.
    const errorBox = document.createElement('div');
    errorBox.className = 'error-message';
    errorBox.textContent = 'Failed to load PDF: ' + error.message;
    pdfContainer.innerHTML = '';
    pdfContainer.appendChild(errorBox);
  }
}
/**
 * Draws page 1 of state.currentPdf onto a fresh canvas at the current zoom
 * level (base scale 1.5 multiplied by state.zoomLevel), swaps that canvas into
 * the PDF panel and updates the zoom percentage label.
 * Returns immediately when no PDF is loaded.
 *
 * @returns {Promise<void>} rejects if PDF.js fails to fetch or render the page.
 */
async function renderPdf() {
  const pdfDoc = state.currentPdf;
  if (!pdfDoc) return;

  // Show a spinner while the page is being rendered off-screen
  pdfContainer.innerHTML = '<div class="loading"><div class="loading-spinner"></div><div>Rendering PDF...</div></div>';

  const firstPage = await pdfDoc.getPage(1);
  const BASE_SCALE = 1.5;
  const viewport = firstPage.getViewport({ scale: BASE_SCALE * state.zoomLevel });

  const pageCanvas = document.createElement('canvas');
  pageCanvas.id = 'pdf-canvas';
  const ctx = pageCanvas.getContext('2d');
  pageCanvas.height = viewport.height;
  pageCanvas.width = viewport.width;

  await firstPage.render({ canvasContext: ctx, viewport: viewport }).promise;

  // Replace the spinner with the finished canvas
  pdfContainer.innerHTML = '';
  pdfContainer.appendChild(pageCanvas);

  // Update zoom level display
  zoomLevel.textContent = `${Math.round(state.zoomLevel * 100)}%`;
}
/**
 * Loads the OCR text of `record` from the source selected in
 * state.currentOcrSource ('textract' reads record.textract_ocr_file, anything
 * else reads record.loc_altoxml_path) into the textarea. While loading, the
 * textarea, export button and search toolbar are disabled and a spinner is
 * shown. On failure an error message is shown for three seconds.
 *
 * @param {object} record - entry from state.records.
 * @returns {Promise<void>} never rejects; load errors are logged and displayed.
 */
async function loadOcrText(record) {
  // Mark this call as the most recent request. Switching record or OCR source
  // while a fetch is still running starts a newer request; when the older one
  // finally finishes it must not overwrite the newer text.
  const requestToken = {};
  loadOcrText.activeRequest = requestToken;
  const isStale = () => loadOcrText.activeRequest !== requestToken;

  // Remember the source at call time so the value checked is the one requested
  const source = state.currentOcrSource;

  ocrText.value = '';
  ocrText.disabled = true;
  exportBtn.disabled = true;
  updateSearchControls(false); // Disable search controls while loading
  state.hasEdits = false;

  const loadingDiv = document.createElement('div');
  loadingDiv.className = 'loading';
  loadingDiv.innerHTML = '<div class="loading-spinner"></div><div>Loading OCR text...</div>';
  ocrText.parentElement.appendChild(loadingDiv);

  try {
    let text = '';
    if (source === 'textract') {
      text = await fetchTextractOcr(record.textract_ocr_file);
    } else {
      text = await fetchLocOcr(record.loc_altoxml_path);
    }

    if (isStale()) {
      // A newer request owns the textarea now; only remove our own spinner
      loadingDiv.remove();
      return;
    }

    state.originalOcrText = text;
    ocrText.value = text;
    ocrText.disabled = false;
    exportBtn.disabled = false;
    updateSearchControls(true); // Enable search controls
    loadingDiv.remove();
  } catch (error) {
    console.error('OCR loading error:', error);

    if (isStale()) {
      loadingDiv.remove();
      return;
    }

    // Build the message with textContent so characters in error.message are
    // shown literally instead of being parsed as HTML.
    const errorBox = document.createElement('div');
    errorBox.className = 'error-message';
    errorBox.textContent = 'Failed to load OCR text: ' + error.message;
    loadingDiv.innerHTML = '';
    loadingDiv.appendChild(errorBox);
    setTimeout(() => loadingDiv.remove(), 3000);
  }
}
/**
 * Downloads the plain-text OCR file at `filePath`.
 *
 * @param {string} filePath - URL or path of the text file.
 * @returns {Promise<string>} the response body as text.
 * @throws {Error} when the HTTP response status is not OK.
 */
async function fetchTextractOcr(filePath) {
  const res = await fetch(filePath);
  if (res.ok) {
    return res.text();
  }
  throw new Error(`HTTP error! status: ${res.status}`);
}
/**
 * Downloads the ALTO XML file at `xmlPath` and converts it to plain text
 * with parseAltoXml().
 *
 * @param {string} xmlPath - URL or path of the ALTO XML file.
 * @returns {Promise<string>} the text extracted from the XML.
 * @throws {Error} when the HTTP response status is not OK or the XML cannot be parsed.
 */
async function fetchLocOcr(xmlPath) {
  const res = await fetch(xmlPath);
  if (res.ok) {
    const rawXml = await res.text();
    return parseAltoXml(rawXml);
  }
  throw new Error(`HTTP error! status: ${res.status}`);
}
/**
 * Extracts plain text from an ALTO XML document.
 * Every <String> element with a non-empty CONTENT attribute contributes one
 * word. Words are joined with spaces, and a new output line is started whenever
 * the VPOS attribute differs from the previous word's VPOS by more than 50.
 *
 * @param {string} xmlText - the XML source.
 * @returns {string} the extracted text, lines separated by "\n".
 * @throws {Error} 'XML parsing error' when the parser reports a <parsererror>.
 */
function parseAltoXml(xmlText) {
  const xmlDoc = new DOMParser().parseFromString(xmlText, 'text/xml');

  // DOMParser signals malformed XML through a <parsererror> element
  if (xmlDoc.querySelector('parsererror')) {
    throw new Error('XML parsing error');
  }

  const NEW_LINE_GAP = 50; // VPOS difference above which a word starts a new line
  const lines = [];
  let words = [];
  let prevVPos = null;

  for (const el of Array.from(xmlDoc.getElementsByTagName('String'))) {
    const word = el.getAttribute('CONTENT');
    if (!word) continue; // elements without text neither add a word nor update prevVPos

    const vpos = el.getAttribute('VPOS');
    const comparable = prevVPos !== null && vpos !== null;
    const jumped = comparable && Math.abs(parseInt(vpos) - parseInt(prevVPos)) > NEW_LINE_GAP;

    if (jumped && words.length > 0) {
      lines.push(words.join(' '));
      words = [];
    }

    words.push(word);
    prevVPos = vpos;
  }

  // Flush the words collected for the final line
  if (words.length > 0) {
    lines.push(words.join(' '));
  }

  return lines.join('\n');
}
/**
 * Downloads the current textarea content as a .txt file named
 * "<id>_<sanitized title>_<publication date>.txt".
 * Does nothing when no record is selected.
 */
function exportText() {
  const record = state.currentRecord;
  if (!record) return;

  const content = ocrText.value;

  // Sanitize filename: non-alphanumerics become "_", runs of "_" collapse, all lowercase
  const titleSlug = record.newspaper_title
    .replace(/[^a-z0-9]/gi, '_')
    .replace(/_+/g, '_')
    .toLowerCase();
  const downloadName = `${record.id}_${titleSlug}_${record.publication_date}.txt`;

  // Offer the text as a download through a temporary object URL and a
  // programmatically clicked link
  const blobUrl = URL.createObjectURL(new Blob([content], { type: 'text/plain' }));
  const link = document.createElement('a');
  link.href = blobUrl;
  link.download = downloadName;

  document.body.appendChild(link);
  link.click();
  document.body.removeChild(link);

  URL.revokeObjectURL(blobUrl);
}
/**
 * Changes the PDF zoom level by `delta` (clamped to 0.5x-5x), re-renders the
 * page and restores the scroll position to the same relative spot.
 * Does nothing when the zoom level is already at the requested limit.
 *
 * @param {number} delta - amount added to state.zoomLevel, e.g. 0.25 or -0.25.
 */
function zoomPdf(delta) {
  const MIN_ZOOM = 0.5;
  const MAX_ZOOM = 5;

  const nextLevel = Math.max(MIN_ZOOM, Math.min(MAX_ZOOM, state.zoomLevel + delta));
  if (nextLevel === state.zoomLevel) {
    // Already at the limit: skip the pointless (and flickering) re-render
    return;
  }

  // Store current scroll position as a fraction of the scrollable range.
  // When the content fits the panel the range is 0; use 0 then, instead of
  // dividing 0 by 0 and carrying NaN into the restore step.
  const rangeX = pdfContainer.scrollWidth - pdfContainer.clientWidth;
  const rangeY = pdfContainer.scrollHeight - pdfContainer.clientHeight;
  const fractionX = rangeX > 0 ? pdfContainer.scrollLeft / rangeX : 0;
  const fractionY = rangeY > 0 ? pdfContainer.scrollTop / rangeY : 0;

  state.zoomLevel = nextLevel;

  // Re-render PDF
  renderPdf()
    .then(() => {
      // Restore scroll position against the new, differently sized canvas
      pdfContainer.scrollLeft = fractionX * (pdfContainer.scrollWidth - pdfContainer.clientWidth);
      pdfContainer.scrollTop = fractionY * (pdfContainer.scrollHeight - pdfContainer.clientHeight);
    })
    .catch(error => {
      // Without this, a failed render would surface as an unhandled rejection
      console.error('PDF rendering error:', error);
    });
}
/** Returns the PDF to 100% zoom and re-renders it. */
function resetZoom() {
  const DEFAULT_ZOOM = 1.0;
  state.zoomLevel = DEFAULT_ZOOM;
  renderPdf();
}
/**
 * Adds click-and-drag panning to the PDF panel. Pressing the mouse button on
 * the panel or on the PDF canvas starts a drag; moving the mouse scrolls the
 * panel by 1.5x the distance travelled; releasing the button or leaving the
 * panel ends the drag. The "grabbable"/"grabbing" classes switch the cursor.
 */
function setupPanControls() {
  // Drag bookkeeping: pointer position and scroll offsets captured on mousedown
  const drag = { active: false, originX: 0, originY: 0, fromLeft: 0, fromTop: 0 };
  const PAN_SPEED = 1.5; // Multiply for faster scrolling

  // Pointer position relative to the panel's offset origin
  const pointerAt = (e) => ({
    x: e.pageX - pdfContainer.offsetLeft,
    y: e.pageY - pdfContainer.offsetTop
  });

  // Shared by mouseleave and mouseup
  const endDrag = () => {
    if (!drag.active) return;
    drag.active = false;
    pdfContainer.classList.remove('grabbing');
    pdfContainer.classList.add('grabbable');
  };

  pdfContainer.addEventListener('mousedown', (e) => {
    // Only pan if we clicked on the container or canvas
    const onPdfArea = e.target === pdfContainer || e.target.id === 'pdf-canvas';
    if (!onPdfArea) return;

    drag.active = true;
    pdfContainer.classList.remove('grabbable');
    pdfContainer.classList.add('grabbing');

    const origin = pointerAt(e);
    drag.originX = origin.x;
    drag.originY = origin.y;
    drag.fromLeft = pdfContainer.scrollLeft;
    drag.fromTop = pdfContainer.scrollTop;
  });

  pdfContainer.addEventListener('mouseleave', endDrag);
  pdfContainer.addEventListener('mouseup', endDrag);

  pdfContainer.addEventListener('mousemove', (e) => {
    if (!drag.active) return;
    e.preventDefault();

    const now = pointerAt(e);
    pdfContainer.scrollLeft = drag.fromLeft - (now.x - drag.originX) * PAN_SPEED;
    pdfContainer.scrollTop = drag.fromTop - (now.y - drag.originY) * PAN_SPEED;
  });
}
// Initialize on page load
init();
</script>
</body>
</html>