docextractor-pro / script.js
ASDAD34's picture
içerik daha iyi türkçeye ocr çok bozuk iyi yapamıyor. abbyfinereader tarzında markdown, json,text formatına belge yapısına uygun çeviri yapsın. bu talimatı uygulama
602f295 verified
document.addEventListener('DOMContentLoaded', function() {
// --- DOM handles -----------------------------------------------------------
// Elements are looked up once by id; the functions below read these bindings.
const uploadBtn = document.getElementById('uploadBtn');
const fileInput = document.getElementById('fileInput');
const filePreviewList = document.getElementById('filePreviewList');
const filePreviewContainer = document.getElementById('filePreviewContainer');
const processBtn = document.getElementById('processBtn');
const outputFormat = document.getElementById('outputFormat');
const resultsContainer = document.getElementById('resultsContainer');
const resultsSection = document.getElementById('resultsSection');
const downloadAllBtn = document.getElementById('downloadAllBtn');
// Files currently selected in the file input (replaced on every selection).
let files = [];
// Results of the most recent processing run, one entry per processed file.
let processedResults = [];
// Point PDF.js at the CDN-hosted worker script (version 3.11.174).
pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.worker.min.js';
// NOTE(review): isEvalSupported is also passed to getDocument() further down;
// confirm that assigning it on GlobalWorkerOptions has any effect at all.
pdfjsLib.GlobalWorkerOptions.isEvalSupported = false;
// The visible upload button forwards clicks to the file input; a change of
// the input's selection is handled by handleFileSelection.
uploadBtn.addEventListener('click', () => fileInput.click());
fileInput.addEventListener('change', handleFileSelection);
/**
 * Handles the file input's `change` event: stores the selected files and
 * rebuilds the preview list.
 *
 * @param {Event} e - change event whose target exposes a `files` list.
 */
function handleFileSelection(e) {
  files = Array.from(e.target.files ?? []);
  filePreviewList.innerHTML = '';
  if (files.length === 0) {
    filePreviewContainer.classList.add('hidden');
    return;
  }
  files.forEach((file, index) => {
    filePreviewList.appendChild(createFilePreview(file, index));
  });
  filePreviewContainer.classList.remove('hidden');
  // createFilePreview() calls feather.replace() before its card is attached
  // to the document, so the most recently created card is never picked up.
  // Replace once more now that every card is in the DOM.
  feather.replace();
}
/**
 * Builds the preview card for one selected file: an icon and the file name on
 * the left, the human-readable size on the right.
 *
 * @param {File} file - the selected file (name, size and type are read).
 * @param {number} index - position in the selection (currently unused).
 * @returns {HTMLDivElement} the detached card element.
 */
function createFilePreview(file, index) {
  // Tiny factory: create an element and optionally give it a class list.
  const make = (tag, classes) => {
    const node = document.createElement(tag);
    if (classes) {
      node.className = classes;
    }
    return node;
  };

  const glyph = make('i');
  glyph.dataset.feather = getFileIcon(file);
  const iconWrap = make('div', 'bg-gray-200 p-2 rounded-full mr-3');
  iconWrap.appendChild(glyph);

  const label = make('span', 'font-medium text-gray-800');
  label.textContent = file.name;

  const info = make('div', 'flex items-center');
  info.appendChild(iconWrap);
  info.appendChild(label);

  const sizeLabel = make('span', 'text-gray-500 text-sm');
  sizeLabel.textContent = formatFileSize(file.size);

  const card = make('div', 'file-card bg-gray-50 rounded-lg p-4 flex items-center justify-between');
  card.appendChild(info);
  card.appendChild(sizeLabel);

  feather.replace();
  return card;
}
/**
 * Picks the Feather icon name for a file from substrings of its MIME type.
 *
 * @param {{type: string}} file - object exposing a MIME type string.
 * @returns {string} 'file', 'file-text' or 'image'.
 */
function getFileIcon(file) {
  // Rules are evaluated top to bottom; the first matching keyword wins.
  const rules = [
    [['pdf'], 'file'],
    [['word', 'document'], 'file-text'],
    [['excel', 'spreadsheet'], 'file-text'],
    [['image'], 'image'],
  ];
  for (const [keywords, icon] of rules) {
    if (keywords.some((keyword) => file.type.includes(keyword))) {
      return icon;
    }
  }
  return 'file';
}
/**
 * Formats a byte count as a short human-readable string (base 1024, at most
 * two decimals), e.g. 1536 -> "1.5 KB".
 *
 * The unit index is clamped to the unit table, so values below one byte stay
 * in "Bytes" and values beyond the largest unit are expressed in that unit
 * instead of producing "undefined".
 *
 * @param {number} bytes - size in bytes.
 * @returns {string} formatted size; "0 Bytes" for zero, negative or
 *   non-finite input.
 */
function formatFileSize(bytes) {
  const units = ['Bytes', 'KB', 'MB', 'GB', 'TB'];
  if (!Number.isFinite(bytes) || bytes <= 0) {
    return '0 Bytes';
  }
  const k = 1024;
  const rawExponent = Math.floor(Math.log(bytes) / Math.log(k));
  const exponent = Math.min(Math.max(rawExponent, 0), units.length - 1);
  return `${parseFloat((bytes / Math.pow(k, exponent)).toFixed(2))} ${units[exponent]}`;
}
// Process files: runs every selected file through processFile() one after the
// other, renders each result, and restores the button when done.
processBtn.addEventListener('click', async function() {
  if (files.length === 0) {
    alert('Please select at least one file');
    return;
  }

  // Register the Turkish data before any OCR runs (it used to happen after
  // processing, where it could not help and could throw outside the try).
  loadTurkishLanguageData();

  resultsContainer.innerHTML = '';
  processedResults = [];
  processBtn.disabled = true;
  processBtn.innerHTML = '<i data-feather="loader" class="spinner mr-2"></i> Processing...';
  feather.replace();
  try {
    for (const file of files) {
      const result = await processFile(file);
      processedResults.push(result);
      displayResult(result);
    }
    resultsSection.classList.remove('hidden');
  } catch (error) {
    console.error('Error processing files:', error);
    alert('An error occurred while processing files: ' + error.message);
  } finally {
    processBtn.disabled = false;
    processBtn.innerHTML = '<i data-feather="cpu" class="mr-2"></i> Process Files';
    feather.replace();
  }

  /**
   * Registers the self-hosted Turkish traineddata once per page.
   * The call is guarded: if the loaded Tesseract build does not expose
   * `addLanguageData`, this is a no-op instead of an unhandled rejection.
   */
  function loadTurkishLanguageData() {
    if (window.tesseractTurDataLoaded) {
      return;
    }
    if (typeof Tesseract === 'undefined' || typeof Tesseract.addLanguageData !== 'function') {
      return;
    }
    try {
      Tesseract.addLanguageData('tur', {
        data: '/static/tesseract/tur.traineddata.gz'
      });
      window.tesseractTurDataLoaded = true;
    } catch (error) {
      console.warn('Could not register Turkish language data:', error);
    }
  }
});
/**
 * Extracts the content of one file with the extractor matching its type and
 * renders it in the format selected in the `outputFormat` control.
 *
 * Spreadsheets are tested before Word documents because the OOXML and
 * OpenDocument spreadsheet MIME types also contain the substring "document"
 * and would otherwise be sent to the Word extractor. File-name extensions are
 * compared case-insensitively.
 *
 * @param {File} file - file to process (name, type and size are read).
 * @returns {Promise<{fileName: string, content: string, format: string}>}
 * @throws {Error} 'Unsupported file type: …' when no extractor matches.
 */
async function processFile(file) {
  const format = outputFormat.value;
  const mimeType = file.type || '';
  const lowerName = (file.name || '').toLowerCase();
  const hasExtension = (...extensions) => extensions.some((ext) => lowerName.endsWith(ext));

  let content;
  if (mimeType.includes('pdf') || hasExtension('.pdf')) {
    content = await extractTextFromPDF(file);
  } else if (mimeType.includes('excel') || mimeType.includes('spreadsheet') ||
      hasExtension('.xlsx', '.xls')) {
    content = await extractTextFromExcel(file);
  } else if (mimeType.includes('word') || mimeType.includes('document') ||
      hasExtension('.docx', '.doc')) {
    content = await extractTextFromWord(file);
  } else if (mimeType.includes('image')) {
    content = await extractTextFromImage(file);
  } else {
    throw new Error('Unsupported file type: ' + file.type);
  }

  // Non-string results (the per-sheet object from the Excel extractor) are
  // serialised so every output format can embed them as text.
  const cleanContent = typeof content === 'string' ? content : JSON.stringify(content, null, 2);

  let formattedContent;
  if (format === 'json') {
    formattedContent = JSON.stringify({
      fileName: file.name,
      fileType: file.type,
      fileSize: file.size,
      content: cleanContent,
      extractedAt: new Date().toISOString()
    }, null, 2);
  } else if (format === 'markdown') {
    formattedContent = `# ${file.name}\n\n`;
    formattedContent += cleanContent;
  } else if (format === 'formatted') {
    formattedContent = cleanContent
      .replace(/([.!?])\s*/g, '$1\n\n') // break after sentence punctuation
      .replace(/\n{3,}/g, '\n\n') // cap blank-line runs
      .replace(/([A-ZÇĞİÖŞÜ][a-zçğıöşü]+)\s+/g, '$1 '); // single space after capitalised words
  } else {
    // Plain text: pass the extracted content through unchanged.
    formattedContent = cleanContent;
  }

  return {
    fileName: file.name,
    content: formattedContent,
    format: format
  };
}
/**
 * Extracts text from a PDF file.
 *
 * Pipeline: read the file, rebuild each page's text from the positioned text
 * items PDF.js reports, repair mis-decoded Turkish characters, fall back to
 * OCR when fewer than 50 characters were found, then run the generic quality
 * pass. If any of that throws, a simplified extraction (first five pages,
 * items joined with spaces) is attempted before giving up.
 *
 * @param {Blob} file - the PDF file.
 * @returns {Promise<string>} extracted text.
 * @throws {Error} 'Failed to extract text from PDF: …' when both strategies
 *   fail, or 'Failed to read PDF file' when the file cannot be read.
 */
async function extractTextFromPDF(file) {
  // Collapse horizontal whitespace only; the line breaks inserted from the
  // Y positions must survive (a plain /\s+/ would erase them again).
  const tidyPageText = (raw) => raw
    .replace(/[^\S\n]+/g, ' ')
    .replace(/ ?\n ?/g, '\n')
    .replace(/\n{3,}/g, '\n\n')
    .trim();

  // Rebuilds one page's text, inserting line breaks / spaces from item positions.
  const readPageText = async (page) => {
    const viewport = page.getViewport({ scale: 2.0 });
    const textContent = await page.getTextContent({
      normalizeWhitespace: false,
      disableCombineTextItems: false,
      includeMarkedContent: true
    });
    let pageText = '';
    let lastY = null;
    let lastX = null;
    for (const item of textContent.items) {
      // includeMarkedContent adds marker entries that carry neither text nor
      // a transform; they must be skipped or the position maths throws.
      if (typeof item.str !== 'string' || !item.transform) {
        continue;
      }
      const tx = pdfjsLib.Util.transform(viewport.transform, item.transform);
      const x = tx[4];
      const y = tx[5];
      // A vertical jump larger than most of the glyph height starts a new line.
      if (lastY !== null && Math.abs(y - lastY) > item.height * 0.8) {
        pageText += '\n';
      }
      // A horizontal gap after the previous item becomes a space.
      if (lastX !== null && x - lastX > item.width * 0.3) {
        pageText += ' ';
      }
      pageText += item.str;
      lastY = y;
      lastX = x + item.width;
    }
    return tidyPageText(pageText);
  };

  // Full-featured extraction; PDF.js always receives its own copy of the bytes.
  const extractPrimary = async (arrayBuffer) => {
    const typedArray = new Uint8Array(arrayBuffer.slice(0));
    const pdf = await pdfjsLib.getDocument({
      data: typedArray.buffer,
      cMapUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/cmaps/',
      cMapPacked: true,
      standardFontDataUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/standard_fonts/',
      useSystemFonts: true,
      useWorkerFetch: true,
      isEvalSupported: false,
      disableAutoFetch: false,
      disableStream: false
    }).promise;

    let fullText = '';
    for (let i = 1; i <= pdf.numPages; i++) {
      const pageText = await readPageText(await pdf.getPage(i));
      if (pageText) {
        fullText += pageText + '\n\n';
      }
    }

    fullText = decodeTurkishText(fullText);
    if (fullText.trim().length < 50) {
      console.warn('Primary text extraction failed, attempting enhanced OCR...');
      fullText = await enhancedOCRFallback(arrayBuffer.slice(0));
    }
    return improveTextQuality(fullText);
  };

  // Simplified extraction used after a failure: default options, first five pages only.
  const extractSimplified = async (arrayBuffer) => {
    const simplePdf = await pdfjsLib.getDocument({
      data: new Uint8Array(arrayBuffer.slice(0))
    }).promise;
    let simpleText = '';
    for (let i = 1; i <= Math.min(simplePdf.numPages, 5); i++) {
      const page = await simplePdf.getPage(i);
      const simpleContent = await page.getTextContent();
      simpleText += simpleContent.items.map((item) => item.str).join(' ') + '\n';
    }
    return simpleText;
  };

  return new Promise((resolve, reject) => {
    const reader = new FileReader();
    reader.onload = async function(event) {
      // Declared outside the try block so the fallback in `catch` can use it.
      const arrayBuffer = event.target.result;
      try {
        resolve(await extractPrimary(arrayBuffer));
      } catch (error) {
        console.error('PDF extraction error:', error);
        try {
          console.warn('Attempting simplified PDF extraction...');
          const simpleText = await extractSimplified(arrayBuffer);
          if (simpleText.trim()) {
            resolve(decodeTurkishText(improveTextQuality(simpleText)));
            return;
          }
        } catch (fallbackError) {
          console.error('Fallback extraction also failed:', fallbackError);
        }
        reject(new Error('Failed to extract text from PDF: ' + error.message));
      }
    };
    reader.onerror = () => reject(new Error('Failed to read PDF file'));
    reader.readAsArrayBuffer(file);
  });
}
/**
 * Repairs Turkish letters that were UTF-8 encoded and then mis-decoded as
 * Windows-1252 / Latin-1 ("mojibake", e.g. "Ã§" for "ç"), then decodes HTML
 * entities when the `he` library is loaded.
 *
 * Correctly decoded text passes through untouched. No case folding and no
 * blanket ASCII-to-Turkish letter substitution is performed, because either
 * would corrupt valid text.
 *
 * @param {string} text - text to repair.
 * @returns {string} repaired text.
 */
function decodeTurkishText(text) {
  // Keys are the two-character sequences produced by reading the UTF-8 bytes
  // of each letter as Windows-1252; where the second byte falls in 0x80-0x9F
  // the raw Latin-1 (C1 control) variant is listed as well.
  const MOJIBAKE = new Map([
    ['\u00C3\u00A7', 'ç'],
    ['\u00C3\u2021', 'Ç'], ['\u00C3\u0087', 'Ç'],
    ['\u00C4\u0178', 'ğ'], ['\u00C4\u009F', 'ğ'],
    ['\u00C4\u017E', 'Ğ'], ['\u00C4\u009E', 'Ğ'],
    ['\u00C4\u00B1', 'ı'],
    ['\u00C4\u00B0', 'İ'],
    ['\u00C3\u00B6', 'ö'],
    ['\u00C3\u2013', 'Ö'], ['\u00C3\u0096', 'Ö'],
    ['\u00C5\u0178', 'ş'], ['\u00C5\u009F', 'ş'],
    ['\u00C5\u017E', 'Ş'], ['\u00C5\u009E', 'Ş'],
    ['\u00C3\u00BC', 'ü'],
    ['\u00C3\u0153', 'Ü'], ['\u00C3\u009C', 'Ü']
  ]);
  const mojibakePattern = new RegExp([...MOJIBAKE.keys()].join('|'), 'g');

  let decodedText = String(text ?? '').replace(mojibakePattern, (match) => MOJIBAKE.get(match));

  // HTML entity decoding is optional: skip quietly when `he` is not loaded.
  if (typeof he !== 'undefined' && typeof he.decode === 'function') {
    try {
      decodedText = he.decode(decodedText);
    } catch (e) {
      console.warn('HTML decoding failed:', e);
    }
  }
  return decodedText;
}
/**
 * OCR fallback for PDFs without a usable text layer: renders the pages to
 * images, runs three Tesseract passes per page in parallel and keeps the
 * longest successful text of each page.
 *
 * @param {ArrayBuffer|Uint8Array} pdfData - raw PDF bytes.
 * @returns {Promise<string>} page texts joined by blank lines, or a notice
 *   when nothing was recognised.
 * @throws {Error} when `pdfData` is neither an ArrayBuffer nor a Uint8Array.
 */
async function enhancedOCRFallback(pdfData) {
  const isRawBuffer = pdfData instanceof ArrayBuffer;
  if (!isRawBuffer && !(pdfData instanceof Uint8Array)) {
    throw new Error('Invalid PDF data format for OCR fallback');
  }
  const sourceBuffer = isRawBuffer ? pdfData : pdfData.buffer;

  const pageImages = await convertPDFToImagesEnhanced(sourceBuffer);
  const pageTexts = [];

  for (const pageImage of pageImages) {
    const attempts = await Promise.allSettled([
      // Turkish + English
      extractTextWithTesseract(pageImage, 'tur+eng'),
      // Turkish + English on a preprocessed image
      extractTextWithTesseract(pageImage, 'tur+eng', { preprocess: true }),
      // English only
      extractTextWithTesseract(pageImage, 'eng')
    ]);

    // Longest fulfilled text wins; on ties the earlier attempt is kept.
    let longestText = '';
    let longestLength = 0;
    for (const attempt of attempts) {
      if (attempt.status !== 'fulfilled') {
        continue;
      }
      if (attempt.value.length > longestLength) {
        longestText = attempt.value;
        longestLength = attempt.value.length;
      }
    }

    if (longestText) {
      pageTexts.push(longestText);
    }
  }

  const combined = pageTexts.join('\n\n');
  return combined ? combined : 'OCR processing completed but no text was extracted.';
}
/**
 * Runs one Tesseract recognition pass and returns the recognised text.
 *
 * @param {*} image - image source handed to Tesseract.recognize.
 * @param {string} [languages='tur+eng'] - Tesseract language string.
 * @param {{preprocess?: boolean}} [options={}] - set `preprocess` to run
 *   preprocessImage() on the image first.
 * @returns {Promise<string>} recognised text.
 * @throws rethrows any preprocessing / recognition error after logging it.
 */
async function extractTextWithTesseract(image, languages = 'tur+eng', options = {}) {
  const reportProgress = (m) => console.log(`Tesseract: ${m.status} - ${Math.round(m.progress * 100)}%`);
  try {
    const source = options.preprocess ? await preprocessImage(image) : image;
    const result = await Tesseract.recognize(source, languages, {
      logger: reportProgress,
      preserve_interword_spaces: '1',
      tessedit_pageseg_mode: '6',
      tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" ğüşıöçĞÜŞİÖÇ@#$%^&*+=<>:;_ ',
      load_system_dawg: '1',
      load_freq_dawg: '1'
    });
    return result.data.text;
  } catch (error) {
    console.error('Tesseract OCR error:', error);
    throw error;
  }
}
/**
 * Produces a black-and-white copy of a canvas for OCR: each pixel is reduced
 * to its luminance and then binarised with a fixed global threshold of 128
 * (alpha is left as is).
 *
 * The result is drawn on a NEW canvas. The input canvas is not modified,
 * because callers run other OCR passes on the same canvas concurrently and
 * an in-place edit would silently change what those passes read.
 *
 * @param {HTMLCanvasElement} canvas - source image.
 * @returns {Promise<HTMLCanvasElement>} new canvas holding the binarised image.
 */
async function preprocessImage(canvas) {
  const sourceCtx = canvas.getContext('2d');
  // getImageData returns a copy of the pixels, so editing it leaves the
  // source canvas untouched.
  const imageData = sourceCtx.getImageData(0, 0, canvas.width, canvas.height);
  const data = imageData.data;
  const threshold = 128;

  for (let i = 0; i < data.length; i += 4) {
    // Store the luminance first so the comparison sees the same rounded
    // 8-bit value the pixel buffer holds.
    data[i] = data[i] * 0.299 + data[i + 1] * 0.587 + data[i + 2] * 0.114;
    const value = data[i] > threshold ? 255 : 0;
    data[i] = value;
    data[i + 1] = value;
    data[i + 2] = value;
  }

  const output = document.createElement('canvas');
  output.width = canvas.width;
  output.height = canvas.height;
  output.getContext('2d').putImageData(imageData, 0, 0);
  return output;
}
/**
 * Renders up to the first ten pages of a PDF onto canvases at 3x scale.
 *
 * @param {ArrayBuffer|Uint8Array} pdfData - raw PDF bytes; a private copy is
 *   handed to PDF.js.
 * @returns {Promise<HTMLCanvasElement[]>} one canvas per rendered page.
 * @throws {Error} when `pdfData` is neither an ArrayBuffer nor a Uint8Array.
 */
async function convertPDFToImagesEnhanced(pdfData) {
  const RENDER_SCALE = 3.0;
  const PAGE_LIMIT = 10;

  let bytes;
  if (pdfData instanceof ArrayBuffer) {
    bytes = new Uint8Array(pdfData.slice(0));
  } else if (pdfData instanceof Uint8Array) {
    bytes = new Uint8Array(pdfData.buffer.slice(0));
  } else {
    throw new Error('Invalid PDF data format for image conversion');
  }

  const pdf = await pdfjsLib.getDocument({
    data: bytes.buffer,
    cMapUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/cmaps/',
    cMapPacked: true,
    standardFontDataUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/standard_fonts/',
    // Worker-related options as configured by the original author.
    useWorkerFetch: false,
    isEvalSupported: false,
    disableWorker: true
  }).promise;

  const pageCount = Math.min(pdf.numPages, PAGE_LIMIT);
  const canvases = [];
  for (let pageNumber = 1; pageNumber <= pageCount; pageNumber += 1) {
    const page = await pdf.getPage(pageNumber);
    const viewport = page.getViewport({ scale: RENDER_SCALE });

    const canvas = document.createElement('canvas');
    const canvasContext = canvas.getContext('2d');
    canvas.height = viewport.height;
    canvas.width = viewport.width;

    const renderTask = page.render({
      canvasContext,
      viewport,
      renderInteractiveForms: true,
      intent: 'print'
    });
    await renderTask.promise;
    canvases.push(canvas);
  }
  return canvases;
}
/**
 * Conservative clean-up of extracted text.
 *
 * - Digits that OCR commonly confuses with letters (0/O, 1/I, 5/S) are
 *   corrected only when they sit directly between two uppercase letters
 *   (e.g. "T0PLAM" -> "TOPLAM"); real numbers are never touched.
 * - Runs of spaces/tabs collapse to one space, spaces around line breaks are
 *   dropped and blank-line runs are capped at one blank line. Line breaks
 *   themselves are preserved.
 *
 * Letters are deliberately NOT rewritten (c->ç, g->ğ, s->ş, ...): whether an
 * ASCII letter should carry a diacritic cannot be decided from the following
 * character, and a blanket substitution corrupts correct text.
 *
 * @param {string} text - text to clean.
 * @returns {string} cleaned text.
 */
function improveTextQuality(text) {
  const UPPER = 'A-ZÇĞİÖŞÜ';
  // Matches `digit` only when flanked by uppercase letters on both sides.
  const betweenCapitals = (digit) => new RegExp(`(?<=[${UPPER}])${digit}(?=[${UPPER}])`, 'g');

  return String(text ?? '')
    .replace(betweenCapitals('0'), 'O')
    .replace(betweenCapitals('1'), 'I')
    .replace(betweenCapitals('5'), 'S')
    // Collapse horizontal whitespace only, so line structure survives.
    .replace(/[^\S\n]+/g, ' ')
    .replace(/ ?\n ?/g, '\n')
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
/**
 * Extracts text from a Word document with mammoth.
 *
 * The raw text is repaired and cleaned; when fewer than 50 characters remain,
 * mammoth's Markdown conversion is tried and used if it yields more text.
 *
 * The file is read in a plain (non-async) Promise executor so that a
 * synchronous failure — e.g. readAsArrayBuffer() rejecting a non-Blob —
 * rejects the returned promise instead of leaving it pending forever.
 *
 * @param {Blob} file - the Word document.
 * @returns {Promise<string>} extracted text.
 */
async function extractTextFromWord(file) {
  const polish = (raw) => improveTextQuality(decodeTurkishText(raw));

  const arrayBuffer = await new Promise((resolve, reject) => {
    const reader = new FileReader();
    reader.onload = (event) => resolve(event.target.result);
    reader.onerror = reject;
    reader.readAsArrayBuffer(file);
  });

  // NOTE(review): the nested `options` object is kept as written; confirm
  // whether mammoth.extractRawText reads options from its first argument.
  const result = await mammoth.extractRawText({
    arrayBuffer: arrayBuffer,
    options: {
      includeDefaultStyleMap: true,
      styleMap: [
        "p[style-name='Heading 1'] => h1:fresh",
        "p[style-name='Heading 2'] => h2:fresh",
        "p[style-name='Heading 3'] => h3:fresh",
        "p[style-name='Title'] => h1.title:fresh",
        "r[style-name='Strong'] => strong",
        "r[style-name='Emphasis'] => em"
      ]
    }
  });

  let text = polish(result.value);

  // Poor result: try the Markdown conversion and keep whichever is longer.
  if (text.trim().length < 50) {
    console.warn('Primary Word extraction failed, trying alternative...');
    const altResult = await mammoth.convertToMarkdown({
      arrayBuffer: arrayBuffer
    });
    if (altResult.value && altResult.value.trim().length > text.trim().length) {
      text = polish(altResult.value);
    }
  }
  return text;
}
/**
 * Reads a spreadsheet with SheetJS and returns, per sheet, the cell grid, a
 * tab-separated dump and basic dimensions.
 *
 * The file is read in a plain (non-async) Promise executor so that a
 * synchronous failure rejects the returned promise instead of leaving it
 * pending forever.
 *
 * @param {Blob} file - the spreadsheet file.
 * @returns {Promise<Object<string, {data: Array<Array<*>>, csv: string,
 *   range: string, rowCount: number, colCount: number}>>} keyed by sheet name.
 */
async function extractTextFromExcel(file) {
  const arrayBuffer = await new Promise((resolve, reject) => {
    const reader = new FileReader();
    reader.onload = (event) => resolve(event.target.result);
    reader.onerror = reject;
    reader.readAsArrayBuffer(file);
  });

  const workbook = XLSX.read(new Uint8Array(arrayBuffer), {
    type: 'array',
    codepage: 1254, // Turkish codepage
    cellStyles: true,
    cellHTML: false
  });

  const result = {};
  for (const sheetName of workbook.SheetNames) {
    const worksheet = workbook.Sheets[sheetName];
    // Row-major grid of formatted cell values; empty cells become ''.
    const jsonData = XLSX.utils.sheet_to_json(worksheet, {
      header: 1,
      raw: false,
      dateNF: 'dd/mm/yyyy',
      defval: ''
    });
    const csvData = XLSX.utils.sheet_to_csv(worksheet, {
      FS: '\t',
      RS: '\n',
      dateNF: 'dd/mm/yyyy'
    });
    // Only string cells go through the text clean-up; other values pass through.
    const processedData = jsonData.map((row) =>
      row.map((cell) => (typeof cell === 'string' ? decodeTurkishText(improveTextQuality(cell)) : cell))
    );
    result[sheetName] = {
      data: processedData,
      csv: decodeTurkishText(csvData),
      range: worksheet['!ref'] || '',
      rowCount: jsonData.length,
      colCount: jsonData[0] ? jsonData[0].length : 0
    };
  }
  return result;
}
/**
 * Thin wrapper around convertPDFToImagesEnhanced() that hands it a fresh copy
 * of the bytes when given an ArrayBuffer or Uint8Array; any other value is
 * forwarded unchanged.
 *
 * @param {ArrayBuffer|Uint8Array|*} pdfData - raw PDF data.
 * @returns {Promise<HTMLCanvasElement[]>} whatever convertPDFToImagesEnhanced returns.
 */
async function convertPDFToImages(pdfData) {
  let payload = pdfData;
  if (pdfData instanceof ArrayBuffer) {
    payload = pdfData.slice(0);
  } else if (pdfData instanceof Uint8Array) {
    payload = pdfData.buffer.slice(0);
  }
  return convertPDFToImagesEnhanced(payload);
}
/**
 * OCRs an image (or canvas) and returns structured, corrected text.
 *
 * Steps: build preprocessed variants of the image, run three Tesseract
 * configurations on every variant, keep results with more than 10
 * non-blank characters, pick the best-scoring one, then run structure
 * analysis and the Turkish correction pass on it.
 *
 * Resolves with the corrected text, or with a fixed notice when no result
 * qualified; rejects with the underlying error when a step throws.
 *
 * NOTE(review): the closing brace of this function is not part of this span;
 * the helper declarations that follow appear to be nested inside it.
 * NOTE(review): the Promise executor is `async`; an exception thrown outside
 * the try block would not reject the promise.
 */
async function extractTextFromImage(file) {
return new Promise(async (resolve, reject) => {
try {
// Both branches of this conditional yield `file`, so this is a plain alias.
const imageElement = file instanceof HTMLCanvasElement ? file : file;
// Build the list of image variants to OCR
const processedImages = await applyAdvancedPreprocessing(imageElement);
// Multi-strategy OCR approach: every variant is tried with every strategy
const ocrResults = [];
for (const processedImage of processedImages) {
// allSettled so that one failing strategy does not discard the others
const results = await Promise.allSettled([
// Strategy 1: Turkish only, page segmentation mode 6
performAdvancedOCR(processedImage, 'tur', {
tessedit_pageseg_mode: '6',
preserve_interword_spaces: '1',
tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" @#$%^&*+=<>:;_ ğüşıöçĞÜŞİÖÇ',
tessedit_ocr_engine_mode: '1',
tessedit_do_ocr: '1',
tessedit_load_image: '1'
}),
// Strategy 2: Turkish+English, page segmentation mode 1
performAdvancedOCR(processedImage, 'tur+eng', {
tessedit_pageseg_mode: '1',
preserve_interword_spaces: '1',
tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" @#$%^&*+=<>:;_ ğüşıöçĞÜŞİÖÇ',
tessedit_ocr_engine_mode: '1'
}),
// Strategy 3: Turkish only, page segmentation mode 3
// NOTE(review): originally labelled "single column mode"; Tesseract documents
// PSM 4 as single column and PSM 3 as fully automatic — confirm the intent.
performAdvancedOCR(processedImage, 'tur', {
tessedit_pageseg_mode: '3',
preserve_interword_spaces: '1',
tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" @#$%^&*+=<>:;_ ğüşıöçĞÜŞİÖÇ'
})
]);
// Keep only fulfilled results with more than 10 non-blank characters
results.forEach(result => {
if (result.status === 'fulfilled' && result.value.text.trim().length > 10) {
ocrResults.push({
text: result.value.text,
// A confidence of 0 (falsy) falls through to calculateConfidence()
confidence: result.value.confidence || calculateConfidence(result.value.text),
strategy: result.value.strategy
});
}
});
}
// Select the best candidate via selectBestResult()
const bestResult = selectBestResult(ocrResults);
if (bestResult) {
// Group lines into sections and render them for the selected output format
const structuredText = await analyzeDocumentStructure(bestResult.text);
// Run the Turkish correction pass over the rendered text
const correctedText = applyIntelligentTurkishCorrections(structuredText);
resolve(correctedText);
} else {
resolve('No text could be extracted from the image.');
}
} catch (error) {
console.error('Enhanced image OCR error:', error);
reject(error);
}
});
/**
 * Builds the list of images to OCR: the base canvas first, followed by the
 * non-null results of the contrast, denoise, sharpen and binarize
 * enhancements (requested one after another, in that order).
 *
 * @param {HTMLCanvasElement|*} imageElement - a canvas, or an image source
 *   that imageToCanvas() can convert.
 * @returns {Promise<Array>} base canvas plus enhancement results.
 */
async function applyAdvancedPreprocessing(imageElement) {
  const baseCanvas = imageElement instanceof HTMLCanvasElement
    ? imageElement
    : await imageToCanvas(imageElement);

  const enhancedVariants = [];
  for (const mode of ['contrast', 'denoise', 'sharpen', 'binarize']) {
    const variant = await applyImageEnhancement(baseCanvas, mode);
    if (variant !== null) {
      enhancedVariants.push(variant);
    }
  }
  return [baseCanvas, ...enhancedVariants];
}
/**
 * Draws an image source onto a new canvas of the same size.
 *
 * The source is loaded through an <img>: canvases via a data URL, anything
 * else (File/Blob) via an object URL. The object URL is revoked once loading
 * has finished, and a load failure rejects the promise instead of leaving it
 * pending forever.
 *
 * @param {HTMLCanvasElement|Blob} image - source image.
 * @returns {Promise<HTMLCanvasElement>} canvas containing the image.
 * @throws {Error} 'Failed to load image for OCR' when the image cannot be decoded.
 */
async function imageToCanvas(image) {
  const isCanvas = image instanceof HTMLCanvasElement;
  const sourceUrl = isCanvas ? image.toDataURL() : URL.createObjectURL(image);
  // Only object URLs hold on to resources that need releasing.
  const releaseUrl = () => {
    if (!isCanvas) {
      URL.revokeObjectURL(sourceUrl);
    }
  };

  return new Promise((resolve, reject) => {
    const img = new Image();
    img.onload = () => {
      try {
        const canvas = document.createElement('canvas');
        canvas.width = img.width;
        canvas.height = img.height;
        const ctx = canvas.getContext('2d');
        ctx.drawImage(img, 0, 0);
        resolve(canvas);
      } catch (error) {
        reject(error);
      } finally {
        releaseUrl();
      }
    };
    img.onerror = () => {
      releaseUrl();
      reject(new Error('Failed to load image for OCR'));
    };
    img.src = sourceUrl;
  });
}
/**
 * Applies one enhancement to a copy of the given canvas.
 *
 * Supported types:
 *  - 'contrast': scales each colour channel around mid-grey by 1.5
 *  - 'denoise' : pulls channels more than 30 away from the pixel's channel
 *                average back to that average
 *  - 'sharpen' : 3x3 sharpening kernel (out-of-bounds neighbours are skipped)
 *  - 'binarize': luminance compared against a fixed threshold of 128
 * Any other type yields an unmodified copy.
 *
 * The input canvas is never modified: every variant must start from the same
 * original pixels, otherwise the enhancements pile up on one shared canvas
 * and all "variants" end up identical.
 *
 * @param {HTMLCanvasElement} canvas - source image.
 * @param {string} type - enhancement name.
 * @returns {Promise<HTMLCanvasElement>} new canvas holding the enhanced image.
 */
async function applyImageEnhancement(canvas, type) {
  const width = canvas.width;
  const height = canvas.height;
  const sourceCtx = canvas.getContext('2d');
  // getImageData returns a copy, so edits below do not touch the source canvas.
  const imageData = sourceCtx.getImageData(0, 0, width, height);
  const data = imageData.data;
  let resultData = imageData;

  switch (type) {
    case 'contrast': {
      const contrast = 1.5;
      for (let i = 0; i < data.length; i += 4) {
        data[i] = ((data[i] - 128) * contrast) + 128;
        data[i + 1] = ((data[i + 1] - 128) * contrast) + 128;
        data[i + 2] = ((data[i + 2] - 128) * contrast) + 128;
      }
      break;
    }
    case 'denoise': {
      const threshold = 30;
      for (let i = 0; i < data.length; i += 4) {
        const avg = (data[i] + data[i + 1] + data[i + 2]) / 3;
        if (Math.abs(data[i] - avg) > threshold) data[i] = avg;
        if (Math.abs(data[i + 1] - avg) > threshold) data[i + 1] = avg;
        if (Math.abs(data[i + 2] - avg) > threshold) data[i + 2] = avg;
      }
      break;
    }
    case 'sharpen': {
      const weights = [0, -1, 0, -1, 5, -1, 0, -1, 0];
      const side = Math.round(Math.sqrt(weights.length));
      const halfSide = Math.floor(side / 2);
      const output = sourceCtx.createImageData(width, height);
      const dst = output.data;
      for (let y = 0; y < height; y++) {
        for (let x = 0; x < width; x++) {
          const dstOff = (y * width + x) * 4;
          let r = 0;
          let g = 0;
          let b = 0;
          for (let cy = 0; cy < side; cy++) {
            for (let cx = 0; cx < side; cx++) {
              const scy = y + cy - halfSide;
              const scx = x + cx - halfSide;
              if (scy >= 0 && scy < height && scx >= 0 && scx < width) {
                const srcOff = (scy * width + scx) * 4;
                const wt = weights[cy * side + cx];
                r += data[srcOff] * wt;
                g += data[srcOff + 1] * wt;
                b += data[srcOff + 2] * wt;
              }
            }
          }
          dst[dstOff] = r;
          dst[dstOff + 1] = g;
          dst[dstOff + 2] = b;
          dst[dstOff + 3] = 255;
        }
      }
      resultData = output;
      break;
    }
    case 'binarize': {
      for (let i = 0; i < data.length; i += 4) {
        const gray = data[i] * 0.299 + data[i + 1] * 0.587 + data[i + 2] * 0.114;
        const value = gray > 128 ? 255 : 0;
        data[i] = value;
        data[i + 1] = value;
        data[i + 2] = value;
      }
      break;
    }
    default:
      break;
  }

  const enhanced = document.createElement('canvas');
  enhanced.width = width;
  enhanced.height = height;
  enhanced.getContext('2d').putImageData(resultData, 0, 0);
  return enhanced;
}
/**
 * Runs a single Tesseract pass and wraps the outcome in a result record.
 * Failures are logged and reported as an empty result instead of throwing.
 *
 * @param {*} image - image source handed to Tesseract.recognize.
 * @param {string} languages - Tesseract language string.
 * @param {Object} config - extra options merged after the progress logger.
 * @returns {Promise<{text: string, confidence: number, strategy?: string}>}
 *   `strategy` is only present on success.
 */
async function performAdvancedOCR(image, languages, config) {
  const progressLogger = (m) => console.log(`OCR (${languages}): ${m.status} - ${Math.round(m.progress * 100)}%`);
  try {
    const options = Object.assign({ logger: progressLogger }, config);
    const { data } = await Tesseract.recognize(image, languages, options);
    const strategy = `OCR_${languages}_${config.tessedit_pageseg_mode}`;
    return {
      text: data.text,
      confidence: data.confidence ? data.confidence : 0,
      strategy,
    };
  } catch (error) {
    console.error(`OCR strategy failed:`, error);
    return { text: '', confidence: 0 };
  }
}
/**
 * Returns the OCR candidate with the highest calculateAdvancedScore() value.
 * The first candidate wins ties; a candidate must score above -1 to be chosen.
 *
 * @param {Array<{text: string, confidence: number}>} results - candidates.
 * @returns {?Object} the winning candidate, or null when none qualifies.
 */
function selectBestResult(results) {
  if (results.length === 0) {
    return null;
  }
  let winner = null;
  let topScore = -1;
  for (const candidate of results) {
    const candidateScore = calculateAdvancedScore(candidate.text, candidate.confidence);
    if (candidateScore > topScore) {
      topScore = candidateScore;
      winner = candidate;
    }
  }
  return winner;
}
/**
 * Heuristic quality score for an OCR result, capped at 100.
 *
 * Starts from the engine confidence and adds:
 *  - up to 40 for the share of Turkish-specific letters among non-space chars
 *  - up to 20 for the share of words containing a Turkish-specific letter
 *  - up to 20 for the average sentence length (saturating at 10 words)
 * Texts shorter than 20 characters have their score halved.
 *
 * Words are matched with Unicode letter classes: the ASCII-only `\w` cannot
 * match ğ/ş/ç/…, so it split Turkish words apart and could never find a
 * Turkish letter inside a "word".
 *
 * @param {string} text - OCR text.
 * @param {number} baseConfidence - engine confidence (falsy counts as 0).
 * @returns {number} score in the range 0..100.
 */
function calculateAdvancedScore(text, baseConfidence) {
  if (!text || text.trim().length === 0) return 0;

  // Non-global on purpose: a /g regex used with .test() keeps state between calls.
  const TURKISH_LETTER = /[ğüşıöçĞÜŞİÖÇ]/;
  let score = baseConfidence || 0;

  // Share of Turkish-specific letters (weight 40)
  const turkishChars = (text.match(/[ğüşıöçĞÜŞİÖÇ]/g) || []).length;
  const totalChars = text.replace(/\s/g, '').length;
  const turkishRatio = totalChars > 0 ? turkishChars / totalChars : 0;
  score += turkishRatio * 40;

  // Share of words containing a Turkish-specific letter (weight 20)
  const words = text.match(/[\p{L}\p{N}_]+/gu) || [];
  const turkishWords = words.filter((word) => TURKISH_LETTER.test(word));
  const wordRatio = words.length > 0 ? turkishWords.length / words.length : 0;
  score += wordRatio * 20;

  // Sentence structure (weight 20)
  const sentences = text.split(/[.!?]+/).filter((s) => s.trim().length > 5);
  const avgSentenceLength = sentences.length > 0
    ? sentences.reduce((sum, s) => sum + s.split(/\s+/).length, 0) / sentences.length
    : 0;
  score += Math.min(avgSentenceLength / 10, 1) * 20;

  // Very short texts are unreliable
  if (text.trim().length < 20) score *= 0.5;

  return Math.min(score, 100);
}
/**
 * Groups the non-blank lines of OCR text into sections (heading, list, table,
 * paragraph) and renders them via formatStructuredText().
 *
 * Each heading line forms its own section; consecutive lines of the same
 * other kind are collected into one section. Lines are classified in the
 * order heading -> list item -> table row -> paragraph.
 *
 * @param {string} text - OCR text.
 * @returns {Promise<*>} whatever formatStructuredText() returns.
 */
async function analyzeDocumentStructure(text) {
  const sections = [];
  let open = { type: 'paragraph', content: [], level: 0 };

  // Stores the open section if it has gathered any lines.
  const flush = () => {
    if (open.content.length > 0) {
      sections.push(open);
    }
  };
  const classify = (line) => {
    if (isHeading(line)) return 'heading';
    if (isListItem(line)) return 'list';
    if (isTableRow(line)) return 'table';
    return 'paragraph';
  };

  for (const rawLine of text.split('\n')) {
    if (rawLine.trim().length === 0) {
      continue;
    }
    const line = rawLine.trim();
    const kind = classify(line);

    if (kind === 'heading') {
      flush();
      open = { type: 'heading', content: [line], level: detectHeadingLevel(line) };
      continue;
    }
    if (open.type !== kind) {
      flush();
      open = { type: kind, content: [], level: 0 };
    }
    open.content.push(line);
  }
  flush();

  return formatStructuredText(sections);
}
/**
 * Heuristic heading test for a single line.
 *
 * Short lines (under 50 characters and at most 8 whitespace-separated words)
 * are headings exactly when more than 60% of their words start with an
 * uppercase letter. Longer lines are headings when they end with a colon or
 * start with a number followed by a capitalised word.
 *
 * NOTE(review): for short lines the colon / numbering rules are never
 * consulted, so e.g. "notlar:" is not a heading — confirm this is intended.
 *
 * @param {string} line - trimmed line of text.
 * @returns {boolean} true when the line looks like a heading.
 */
function isHeading(line) {
  const tokens = line.split(/\s+/);
  const isShortLine = line.length < 50 && tokens.length <= 8;

  if (isShortLine) {
    let capitalised = 0;
    for (const token of tokens) {
      if (/^[A-ZÇĞİÖŞÜ]/.test(token)) {
        capitalised += 1;
      }
    }
    return capitalised / tokens.length > 0.6;
  }

  return line.endsWith(':') || /^\d+\.?\s+[A-ZÇĞİÖŞÜ]/.test(line);
}
/**
 * Estimates a heading level (1-3) from numbering and line length.
 *
 * "N.M " numbering -> 2, "N " numbering -> 1; otherwise the length decides:
 * under 30 characters -> 1, under 40 -> 2, anything longer -> 3.
 *
 * @param {string} line - heading text.
 * @returns {number} heading level between 1 and 3.
 */
function detectHeadingLevel(line) {
  const hasSubNumbering = /^\d+\.\d+\s+/.test(line);
  if (hasSubNumbering) {
    return 2;
  }
  const hasTopNumbering = /^\d+\s+/.test(line);
  if (hasTopNumbering || line.length < 30) {
    return 1;
  }
  return line.length < 40 ? 2 : 3;
}
/**
 * Tells whether a line starts with a list marker followed by whitespace:
 * a bullet (-, *, •), a number with a dot ("1."), or a lowercase letter in
 * parentheses ("(a)").
 *
 * @param {string} line - trimmed line of text.
 * @returns {boolean} true when the line starts with a list marker.
 */
function isListItem(line) {
  const markerPatterns = [
    /^[-*•]\s+/,
    /^\d+\.\s+/,
    /^\([a-z]\)\s+/,
  ];
  return markerPatterns.some((pattern) => pattern.test(line));
}
/**
 * Tells whether a line looks like a table row: it contains at least two tab
 * characters or at least three pipe characters.
 *
 * @param {string} line - line of text.
 * @returns {boolean} true when the line looks like a table row.
 */
function isTableRow(line) {
  const countOf = (char) => {
    let total = 0;
    for (const current of line) {
      if (current === char) {
        total += 1;
      }
    }
    return total;
  };
  return countOf('\t') >= 2 || countOf('|') >= 3;
}
/**
 * Renders sections in the format currently selected in the `outputFormat`
 * control: 'markdown', 'json' or 'formatted'. Any other value produces plain
 * text with each section's lines joined by spaces and sections separated by
 * a blank line.
 *
 * @param {Array<{type: string, content: string[], level: number}>} sections
 * @returns {string} rendered text.
 */
function formatStructuredText(sections) {
  switch (outputFormat.value) {
    case 'markdown':
      return formatAsMarkdown(sections);
    case 'json':
      return formatAsJSON(sections);
    case 'formatted':
      return formatAsStructuredText(sections);
    default: {
      const blocks = sections.map((section) => section.content.join(' '));
      return blocks.join('\n\n');
    }
  }
}
/**
 * Renders sections as Markdown.
 *
 * - Headings use `level` hashes, clamped to the valid range 1..6.
 * - List items: an existing bullet marker (-, *, •) is replaced by "- "
 *   instead of being duplicated; numbered items ("1. …") are kept as they
 *   already are valid Markdown.
 * - Tables: cells are trimmed, the empty cells produced by leading/trailing
 *   pipes are dropped, separator rows found in the source are skipped, and a
 *   header separator is emitted after the first row so the output is a valid
 *   Markdown table.
 *
 * @param {Array<{type: string, content: string[], level: number}>} sections
 * @returns {string} Markdown text without leading/trailing whitespace.
 */
function formatAsMarkdown(sections) {
  const toListItem = (item) => {
    if (/^\d+\.\s+/.test(item)) {
      return item;
    }
    return `- ${item.replace(/^[-*•]\s+/, '')}`;
  };

  const toCells = (row) => {
    const cells = row.split(/\t+|\|/).map((cell) => cell.trim());
    if (cells.length > 1 && cells[0] === '') {
      cells.shift();
    }
    if (cells.length > 1 && cells[cells.length - 1] === '') {
      cells.pop();
    }
    return cells;
  };
  const isSeparatorRow = (cells) => cells.length > 0 && cells.every((cell) => /^:?-{2,}:?$/.test(cell));
  const toRow = (cells) => `| ${cells.join(' | ')} |`;

  let markdown = '';
  sections.forEach((section) => {
    switch (section.type) {
      case 'heading': {
        const depth = Math.min(Math.max(section.level || 1, 1), 6);
        markdown += `${'#'.repeat(depth)} ${section.content[0]}\n\n`;
        break;
      }
      case 'list':
        section.content.forEach((item) => {
          markdown += `${toListItem(item)}\n`;
        });
        markdown += '\n';
        break;
      case 'table': {
        const rows = section.content.map(toCells).filter((cells) => !isSeparatorRow(cells));
        rows.forEach((cells, rowIndex) => {
          markdown += `${toRow(cells)}\n`;
          if (rowIndex === 0) {
            markdown += `${toRow(cells.map(() => '---'))}\n`;
          }
        });
        markdown += '\n';
        break;
      }
      case 'paragraph':
        markdown += section.content.join(' ') + '\n\n';
        break;
    }
  });
  return markdown.trim();
}
/**
 * Serialises sections as pretty-printed JSON (2-space indent), keeping only
 * the `type`, `level` and `content` of each section, in that key order.
 *
 * @param {Array<{type: string, content: string[], level: number}>} sections
 * @returns {string} JSON text.
 */
function formatAsJSON(sections) {
  const slimSections = [];
  for (const section of sections) {
    slimSections.push({
      type: section.type,
      level: section.level,
      content: section.content,
    });
  }
  return JSON.stringify(slimSections, null, 2);
}
/**
 * Renders sections as plain structured text: headings in upper case with an
 * "=" underline, list items behind a bullet, table rows verbatim and
 * paragraphs joined with spaces.
 *
 * List items that already carry a bullet marker (-, *, •) have it replaced by
 * the output bullet instead of receiving a second marker; numbered items keep
 * their own numbering. The underline is sized to the upper-cased title, whose
 * length can differ from the original.
 *
 * NOTE(review): toUpperCase() is locale-independent ("i" becomes "I", not
 * "İ"); switching to Turkish casing would change English headings, so it is
 * left as is — confirm the desired behaviour.
 *
 * @param {Array<{type: string, content: string[], level: number}>} sections
 * @returns {string} text without leading/trailing whitespace.
 */
function formatAsStructuredText(sections) {
  const toListLine = (item) => {
    if (/^\d+\.\s+/.test(item)) {
      return ' ' + item;
    }
    return ' • ' + item.replace(/^[-*•]\s+/, '');
  };

  let text = '';
  sections.forEach((section) => {
    switch (section.type) {
      case 'heading': {
        const title = section.content[0].toUpperCase();
        text += '\n' + title + '\n';
        text += '='.repeat(title.length) + '\n\n';
        break;
      }
      case 'list':
        section.content.forEach((item) => {
          text += toListLine(item) + '\n';
        });
        text += '\n';
        break;
      case 'table':
        section.content.forEach((row) => {
          text += row + '\n';
        });
        text += '\n';
        break;
      case 'paragraph':
        text += section.content.join(' ') + '\n\n';
        break;
    }
  });
  return text.trim();
}
/**
 * Conservative post-OCR corrections for Turkish text.
 *
 * Applied, in order:
 *  1. Digits 0/1/5 become O/İ/S only when they sit directly between two
 *     uppercase letters (e.g. "T0PLAM" -> "TOPLAM"). Real numbers — and the
 *     numbers inside JSON output — are never touched.
 *  2. The unambiguous misspelling "icin" becomes "için" (case preserved),
 *     matched on Unicode-aware word boundaries.
 *  3. Spaces before punctuation are removed, a missing space after , ; ! ?
 *     is inserted when a letter follows, and runs of interior spaces collapse
 *     to one.
 *
 * Line breaks are preserved, because the input is already structured
 * (Markdown / JSON / formatted text) and flattening it would destroy that
 * structure. Single letters and valid words (e.g. "su", "Bu", "Ve") are not
 * rewritten: ASCII `\b` treats Turkish letters as word breaks, so such rules
 * corrupt correct words like "işi".
 *
 * @param {string} text - structured OCR text.
 * @returns {string} corrected text.
 */
function applyIntelligentTurkishCorrections(text) {
  const UPPER = 'A-ZÇĞİÖŞÜ';
  const digitFixes = [['0', 'O'], ['1', 'İ'], ['5', 'S']];

  // Unicode-aware replacement for \b: not adjacent to a letter, digit or "_".
  const NOT_WORD_BEFORE = '(?<![\\p{L}\\p{N}_])';
  const NOT_WORD_AFTER = '(?![\\p{L}\\p{N}_])';
  const wholeWord = (source) => new RegExp(`${NOT_WORD_BEFORE}${source}${NOT_WORD_AFTER}`, 'gu');
  const wordFixes = [
    [wholeWord('icin'), 'için'],
    [wholeWord('[Iİ]cin'), 'İçin'],
    [wholeWord('[Iİ]C[Iİ]N'), 'İÇİN']
  ];

  let correctedText = String(text ?? '');

  digitFixes.forEach(([digit, letter]) => {
    const flankedByCapitals = new RegExp(`(?<=[${UPPER}])${digit}(?=[${UPPER}])`, 'g');
    correctedText = correctedText.replace(flankedByCapitals, letter);
  });

  wordFixes.forEach(([pattern, replacement]) => {
    correctedText = correctedText.replace(pattern, replacement);
  });

  return correctedText
    // "word ," -> "word," (spaces only; line breaks are left alone)
    .replace(/(?<=\S) +([.,!?;:])/g, '$1')
    // "a,b" -> "a, b"; "." and ":" are excluded to keep 3.14, 12:30 and URLs intact
    .replace(/([,;!?])(?=\p{L})/gu, '$1 ')
    // collapse interior runs of spaces without touching indentation
    .replace(/(?<=\S) {2,}/g, ' ')
    .trim();
}
/**
 * Convert hOCR markup into plain text, annotating uncertain words.
 *
 * For each `.ocrx_word` inside `.ocr_line` inside `.ocr_par`:
 *  - confidence below 50 is wrapped as `[word]`
 *  - confidence below 70 is wrapped as `<span confidence-medium>word</span>`
 *  - everything else is emitted as-is
 * Words are separated by single spaces, each line is followed by a blank
 * line, and each paragraph by one more newline.
 *
 * Side effects on `window.ocrLearningDict`:
 *  - a word whose entry has a truthy `confirmedCorrect` is shown as that
 *    corrected text (exact whole-word match on the parsed text, so markup and
 *    regex metacharacters can never be affected);
 *  - every non-blank word with confidence below 85 gets its `occurrences`
 *    counter incremented under its raw OCR text (the entry is created if
 *    missing).
 *
 * A word without a readable `x_wconf` value is treated as confident: it is
 * emitted unmarked and is not recorded.
 *
 * @param {string} hocr - hOCR markup to parse.
 * @returns {string} The formatted text.
 */
function processFormattedOCR(hocr) {
  const LOW_CONFIDENCE = 50;
  const MEDIUM_CONFIDENCE = 70;
  const LEARNING_THRESHOLD = 85;

  // Own-property check so words such as "constructor" never resolve to
  // members inherited from Object.prototype.
  const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);

  // Reads the integer after `x_wconf` in the title attribute, or null.
  const readConfidence = (wordEl) => {
    const title = wordEl.getAttribute('title') || '';
    const found = /x_wconf (\d+)/.exec(title);
    return found ? Number.parseInt(found[1], 10) : null;
  };

  // Returns the user-confirmed spelling for a raw OCR word, if there is one.
  const applyLearnedCorrection = (rawWord) => {
    const dict = window.ocrLearningDict;
    if (!dict || !hasOwn(dict, rawWord)) {
      return rawWord;
    }
    const entry = dict[rawWord];
    return entry && entry.confirmedCorrect ? entry.confirmedCorrect : rawWord;
  };

  // Counts one more sighting of an uncertain word.
  const rememberUncertainWord = (rawWord) => {
    if (!window.ocrLearningDict) {
      window.ocrLearningDict = {};
    }
    const dict = window.ocrLearningDict;
    if (!hasOwn(dict, rawWord)) {
      dict[rawWord] = {
        occurrences: 0,
        confirmedCorrect: null,
        suggestTime: null
      };
    }
    dict[rawWord].occurrences++;
  };

  const doc = new DOMParser().parseFromString(hocr, 'text/html');
  let formattedText = '';

  doc.querySelectorAll('.ocr_par').forEach((par) => {
    par.querySelectorAll('.ocr_line').forEach((line) => {
      const pieces = [];

      line.querySelectorAll('.ocrx_word').forEach((wordEl) => {
        const rawWord = wordEl.textContent || '';
        const confidence = readConfidence(wordEl);
        const shownWord = applyLearnedCorrection(rawWord);

        if (confidence !== null && confidence < LOW_CONFIDENCE) {
          pieces.push(`[${shownWord}]`);
        } else if (confidence !== null && confidence < MEDIUM_CONFIDENCE) {
          pieces.push(`<span confidence-medium>${shownWord}</span>`);
        } else {
          pieces.push(shownWord);
        }

        // Blank tokens are not recorded: an empty key would match any text.
        if (confidence !== null && confidence < LEARNING_THRESHOLD && rawWord.trim() !== '') {
          rememberUncertainWord(rawWord);
        }
      });

      formattedText += pieces.join(' ').trim() + '\n\n';
    });
    formattedText += '\n';
  });

  return formattedText;
}
}
/**
 * Render one processed result as a card inside `resultsContainer`.
 *
 * Before rendering, every entry of `window.ocrLearningDict` that has no
 * verdict yet (`confirmedCorrect === null`) and whose word occurs in
 * `result.content` is put to the user via `confirm` / `prompt`. A typed
 * correction is stored in the dictionary and substituted into
 * `result.content` (the passed object is mutated on purpose so a later
 * download contains the corrected text).
 *
 * The substitution is a literal text replacement: OCR words containing regex
 * metacharacters and corrections containing `$` are handled verbatim.
 *
 * @param {{fileName: string, format: string, content: string}} result
 *   The processed result to show.
 * @returns {void}
 */
function displayResult(result) {
  // Evaluated before any correction is applied to the content.
  const isOCRResult = result.content.includes('OCR processing attempted') ||
    result.content.includes('Warning: No extractable text found');

  if (window.ocrLearningDict) {
    for (const [word, data] of Object.entries(window.ocrLearningDict)) {
      // An empty key would "occur" in every text and corrupt it on replace.
      if (word === '' || data.confirmedCorrect !== null || !result.content.includes(word)) {
        continue;
      }
      data.suggestTime = new Date().toISOString();
      if (confirm(`Is "${word}" correctly recognized? If not, please type the correct version.`)) {
        data.confirmedCorrect = word;
        continue;
      }
      const corrected = prompt(`Please enter correct version for "${word}":`, word);
      if (corrected) {
        data.confirmedCorrect = corrected;
        // split/join replaces every literal occurrence without building a
        // RegExp from OCR text or interpreting `$` in the correction.
        result.content = result.content.split(word).join(corrected);
      }
    }
  }

  const resultCard = document.createElement('div');
  resultCard.className = 'bg-gray-50 rounded-lg p-4 shadow-sm';

  const header = document.createElement('div');
  header.className = 'flex justify-between items-center mb-3';

  const title = document.createElement('h3');
  title.className = 'font-semibold text-lg text-gray-800 truncate';
  title.textContent = result.fileName;

  const downloadBtn = document.createElement('button');
  downloadBtn.className = 'bg-indigo-600 hover:bg-indigo-700 text-white font-medium py-1 px-3 rounded transition duration-200 flex items-center text-sm';
  downloadBtn.innerHTML = '<i data-feather="download" class="mr-1"></i> Download';
  downloadBtn.addEventListener('click', () => downloadResult(result));

  header.appendChild(title);
  header.appendChild(downloadBtn);

  const content = document.createElement('div');
  if (isOCRResult) {
    const warning = document.createElement('div');
    warning.className = 'pdf-ocr-warning';
    warning.innerHTML = [
      '',
      '<div class="flex items-start">',
      '<i data-feather="alert-triangle" class="mr-2"></i>',
      '<div>',
      '<strong>Note:</strong> This PDF was processed using OCR as no selectable text was found.',
      'Results may contain errors or inaccuracies.',
      '</div>',
      '</div>',
      ''
    ].join('\n');
    content.appendChild(warning);
  }

  // The text is assigned through textContent, so it is never parsed as HTML.
  const pre = document.createElement('pre');
  pre.className = result.format === 'formatted' ? 'ocr-result turkish-text' : '';
  pre.style.cssText = 'font-family: monospace; white-space: pre-wrap; word-wrap: break-word; line-height: 1.6;';

  if (result.format === 'json') {
    try {
      // Re-indent valid JSON for display.
      pre.textContent = JSON.stringify(JSON.parse(result.content), null, 2);
    } catch (e) {
      // Not valid JSON: show the raw content instead of failing the render.
      pre.textContent = result.content;
    }
  } else {
    pre.textContent = result.content;
  }
  content.appendChild(pre);

  resultCard.appendChild(header);
  resultCard.appendChild(content);
  resultsContainer.appendChild(resultCard);

  // Turn the data-feather placeholders inserted above into icons.
  feather.replace();
}
/**
 * Offer one processed result to the user as a file download.
 *
 * The file name is the source file name with only its last extension
 * replaced by `result.format` ("my.report.v2.pdf" -> "my.report.v2.json").
 * Content is always encoded as UTF-8. A UTF-8 byte order mark is prepended
 * for text and Markdown so applications that sniff encodings show Turkish
 * characters correctly; JSON is written without one because a BOM makes the
 * payload invalid for strict JSON parsers.
 *
 * @param {{fileName: string, format: string, content: string}} result
 *   The processed result to download.
 * @returns {void}
 */
function downloadResult(result) {
  const isJson = result.format === 'json';

  let mimeType = 'text/plain;charset=utf-8';
  if (isJson) {
    mimeType = 'application/json;charset=utf-8';
  } else if (result.format === 'markdown') {
    mimeType = 'text/markdown;charset=utf-8';
  }

  // Blob encodes string parts as UTF-8, so U+FEFF becomes the bytes EF BB BF.
  const parts = isJson ? [result.content] : ['\uFEFF', result.content];
  const blob = new Blob(parts, { type: mimeType });

  // Strip only the final extension; a leading dot (".env") is not one.
  const lastDot = result.fileName.lastIndexOf('.');
  const baseName = lastDot > 0 ? result.fileName.slice(0, lastDot) : result.fileName;

  const url = URL.createObjectURL(blob);
  const a = document.createElement('a');
  a.href = url;
  a.download = `${baseName}.${result.format}`;
  document.body.appendChild(a);
  try {
    a.click();
  } finally {
    // Always detach the temporary link and release the object URL.
    document.body.removeChild(a);
    URL.revokeObjectURL(url);
  }
}
// "Download all": trigger one download per processed result, in order.
downloadAllBtn.addEventListener('click', () => {
  for (const result of processedResults) {
    downloadResult(result);
  }
});
});