document.addEventListener('DOMContentLoaded', function() { const uploadBtn = document.getElementById('uploadBtn'); const fileInput = document.getElementById('fileInput'); const filePreviewList = document.getElementById('filePreviewList'); const filePreviewContainer = document.getElementById('filePreviewContainer'); const processBtn = document.getElementById('processBtn'); const outputFormat = document.getElementById('outputFormat'); const resultsContainer = document.getElementById('resultsContainer'); const resultsSection = document.getElementById('resultsSection'); const downloadAllBtn = document.getElementById('downloadAllBtn'); let files = []; let processedResults = []; // Set enhanced PDF.js worker path with additional configurations pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.worker.min.js'; // Configure PDF.js for better text extraction pdfjsLib.GlobalWorkerOptions.isEvalSupported = false; // Handle file selection uploadBtn.addEventListener('click', () => fileInput.click()); fileInput.addEventListener('change', handleFileSelection); function handleFileSelection(e) { files = Array.from(e.target.files); filePreviewList.innerHTML = ''; if (files.length === 0) { filePreviewContainer.classList.add('hidden'); return; } files.forEach((file, index) => { const filePreview = createFilePreview(file, index); filePreviewList.appendChild(filePreview); }); filePreviewContainer.classList.remove('hidden'); } function createFilePreview(file, index) { const card = document.createElement('div'); card.className = 'file-card bg-gray-50 rounded-lg p-4 flex items-center justify-between'; const fileInfo = document.createElement('div'); fileInfo.className = 'flex items-center'; const icon = document.createElement('div'); icon.className = 'bg-gray-200 p-2 rounded-full mr-3'; const fileIcon = document.createElement('i'); fileIcon.dataset.feather = getFileIcon(file); icon.appendChild(fileIcon); const fileName = document.createElement('span'); fileName.className = 'font-medium text-gray-800'; fileName.textContent = file.name; fileInfo.appendChild(icon); fileInfo.appendChild(fileName); const fileSize = document.createElement('span'); fileSize.className = 'text-gray-500 text-sm'; fileSize.textContent = formatFileSize(file.size); card.appendChild(fileInfo); card.appendChild(fileSize); feather.replace(); return card; } function getFileIcon(file) { if (file.type.includes('pdf')) return 'file'; if (file.type.includes('word') || file.type.includes('document')) return 'file-text'; if (file.type.includes('excel') || file.type.includes('spreadsheet')) return 'file-text'; if (file.type.includes('image')) return 'image'; return 'file'; } function formatFileSize(bytes) { if (bytes === 0) return '0 Bytes'; const k = 1024; const sizes = ['Bytes', 'KB', 'MB', 'GB']; const i = Math.floor(Math.log(bytes) / Math.log(k)); return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i]; } // Process files processBtn.addEventListener('click', async function() { if (files.length === 0) { alert('Please select at least one file'); return; } resultsContainer.innerHTML = ''; processedResults = []; processBtn.disabled = true; processBtn.innerHTML = ' Processing...'; feather.replace(); try { for (const file of files) { const result = await processFile(file); processedResults.push(result); displayResult(result); } resultsSection.classList.remove('hidden'); } catch (error) { console.error('Error processing files:', error); alert('An error occurred while processing files: ' + error.message); } finally { processBtn.disabled = false; processBtn.innerHTML = ' Process Files'; feather.replace(); } // Load additional Turkish language data function loadTurkishLanguageData() { if (!window.tesseractTurDataLoaded) { Tesseract.addLanguageData('tur', { data: '/static/tesseract/tur.traineddata.gz' }); window.tesseractTurDataLoaded = true; } } loadTurkishLanguageData(); } ); async function processFile(file) { const format = outputFormat.value; let content; if (file.type.includes('pdf')) { content = await extractTextFromPDF(file); } else if (file.type.includes('word') || file.type.includes('document') || file.name.endsWith('.docx') || file.name.endsWith('.doc')) { content = await extractTextFromWord(file); } else if (file.type.includes('excel') || file.type.includes('spreadsheet') || file.name.endsWith('.xlsx') || file.name.endsWith('.xls')) { content = await extractTextFromExcel(file); } else if (file.type.includes('image')) { content = await extractTextFromImage(file); } else { throw new Error('Unsupported file type: ' + file.type); } // Convert content to requested format let formattedContent; // Ensure content is properly encoded for Turkish characters const cleanContent = typeof content === 'string' ? content : JSON.stringify(content, null, 2); if (format === 'json') { formattedContent = { fileName: file.name, fileType: file.type, fileSize: file.size, content: cleanContent, extractedAt: new Date().toISOString() }; // Use custom replacer to handle Turkish characters properly formattedContent = JSON.stringify(formattedContent, null, 2); } else if (format === 'markdown') { formattedContent = `# ${file.name}\n\n`; formattedContent += cleanContent; } else if (format === 'formatted') { // Apply better formatting for Turkish text formattedContent = cleanContent .replace(/([.!?])\s*/g, '$1\n\n') // Better paragraph breaks .replace(/\n{3,}/g, '\n\n') // Remove excessive line breaks .replace(/([A-ZÇĞİÖŞÜ][a-zçğıöşü]+)\s+/g, '$1 '); // Preserve Turkish words } else { // Plain text - ensure Turkish characters are preserved formattedContent = cleanContent; } return { fileName: file.name, content: formattedContent, format: format }; } async function extractTextFromPDF(file) { return new Promise(async (resolve, reject) => { const reader = new FileReader(); reader.onload = async function(event) { try { // Create a copy of the ArrayBuffer to avoid detachment issues const arrayBuffer = event.target.result; const typedArray = new Uint8Array(arrayBuffer.slice(0)); // Enhanced PDF loading with multiple extraction strategies const loadingTask = pdfjsLib.getDocument({ data: typedArray.buffer, cMapUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/cmaps/', cMapPacked: true, standardFontDataUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/standard_fonts/', useSystemFonts: true, useWorkerFetch: true, isEvalSupported: false, disableAutoFetch: false, disableStream: false }); const pdf = await loadingTask.promise; let fullText = ''; let metadata = await pdf.getMetadata(); // Strategy 1: Enhanced text extraction with structural analysis for (let i = 1; i <= pdf.numPages; i++) { const page = await pdf.getPage(i); // Get viewport for better text positioning const viewport = page.getViewport({ scale: 2.0 }); // Enhanced text content extraction const textContent = await page.getTextContent({ normalizeWhitespace: false, disableCombineTextItems: false, includeMarkedContent: true }); // Process text items with better grouping const textItems = textContent.items; let pageText = ''; let lastY = null; let lastX = null; for (let j = 0; j < textItems.length; j++) { const item = textItems[j]; const tx = pdfjsLib.Util.transform( viewport.transform, item.transform ); const x = tx[4]; const y = tx[5]; // Add line breaks based on Y position if (lastY !== null && Math.abs(y - lastY) > item.height * 0.8) { pageText += '\n'; } // Add spaces based on X position if (lastX !== null && x - lastX > item.width * 0.3) { pageText += ' '; } pageText += item.str; lastY = y; lastX = x + item.width; } // Clean up and format the text pageText = pageText .replace(/\s+/g, ' ') .replace(/\n\s*\n/g, '\n\n') .trim(); if (pageText) { fullText += pageText + '\n\n'; } } // Strategy 2: Enhanced Turkish character decoding fullText = decodeTurkishText(fullText); // Strategy 3: If still poor quality, try OCR with preprocessing if (!fullText.trim() || fullText.trim().length < 50) { console.warn('Primary text extraction failed, attempting enhanced OCR...'); // Create a fresh copy for OCR to avoid detachment const ocrArrayBuffer = arrayBuffer.slice(0); fullText = await enhancedOCRFallback(ocrArrayBuffer); } // Strategy 4: Apply text quality improvements fullText = improveTextQuality(fullText); resolve(fullText); } catch (error) { console.error('PDF extraction error:', error); // Try a simpler extraction method as fallback try { console.warn('Attempting simplified PDF extraction...'); const simpleArray = new Uint8Array(arrayBuffer.slice(0)); const simpleLoadingTask = pdfjsLib.getDocument(simpleArray.buffer); const simplePdf = await simpleLoadingTask.promise; let simpleText = ''; for (let i = 1; i <= Math.min(simplePdf.numPages, 5); i++) { const page = await simplePdf.getPage(i); const simpleContent = await page.getTextContent(); const pageText = simpleContent.items.map(item => item.str).join(' '); simpleText += pageText + '\n'; } if (simpleText.trim()) { resolve(decodeTurkishText(improveTextQuality(simpleText))); return; } } catch (fallbackError) { console.error('Fallback extraction also failed:', fallbackError); } reject(new Error('Failed to extract text from PDF: ' + error.message)); } }; reader.onerror = () => reject(new Error('Failed to read PDF file')); reader.readAsArrayBuffer(file); }); } // Enhanced Turkish text decoding function decodeTurkishText(text) { // Multiple encoding fixes for Turkish characters const fixes = [ // UTF-8 double encoding [/\u00C3\u00A7/g, 'ç'], [/\u00C3\u0087/g, 'Ç'], [/\u00C3\u011F/g, 'ğ'], [/\u00C4\u0178/g, 'Ğ'], [/\u00C3\u00BC/g, 'ü'], [/\u00C3\u009C/g, 'Ü'], [/\u00C3\u015F/g, 'ş'], [/\u00C5\u0178/g, 'Ş'], [/\u00C3\u0131/g, 'ı'], [/\u00C4\u0131/g, 'İ'], [/\u00C3\u00B6/g, 'ö'], [/\u00C3\u0096/g, 'Ö'], // ISO-8859-9 to UTF-8 [/[\u00C4\u00E4]/g, 'ä'], [/[\u00C5\u00E5]/g, 'å'], [/[\u00C6\u00E6]/g, 'æ'], [/[\u00C7\u00E7]/g, 'ç'], [/[\u00D0\u00F0]/g, 'ð'], [/[\u011E\u011F]/g, 'ğ'], [/[\u0130\u0131]/g, 'ı'], [/[\u015E\u015F]/g, 'ş'], [/[\u00D6\u00F6]/g, 'ö'], [/[\u00DC\u00FC]/g, 'ü'], [/[\u00DE\u00FE]/g, 'þ'], // Common OCR errors [/c/g, 'ç', { context: 'turkish' }], [/C/g, 'Ç', { context: 'turkish' }], [/g/g, 'ğ', { context: 'turkish' }], [/G/g, 'Ğ', { context: 'turkish' }], [/i/g, 'ı', { context: 'turkish' }], [/I/g, 'İ', { context: 'turkish' }], [/o/g, 'ö', { context: 'turkish' }], [/O/g, 'Ö', { context: 'turkish' }], [/s/g, 'ş', { context: 'turkish' }], [/S/g, 'Ş', { context: 'turkish' }], [/u/g, 'ü', { context: 'turkish' }], [/U/g, 'Ü', { context: 'turkish' }] ]; let decodedText = text; fixes.forEach(fix => { if (Array.isArray(fix) && fix.length === 2) { decodedText = decodedText.replace(fix[0], fix[1]); } }); // Apply HTML entity decoding if needed try { decodedText = he.decode(decodedText); } catch (e) { console.warn('HTML decoding failed:', e); } return decodedText; } // Enhanced OCR fallback with multiple engines async function enhancedOCRFallback(pdfData) { // Ensure we have a valid ArrayBuffer let arrayBuffer; if (pdfData instanceof ArrayBuffer) { arrayBuffer = pdfData; } else if (pdfData instanceof Uint8Array) { arrayBuffer = pdfData.buffer; } else { throw new Error('Invalid PDF data format for OCR fallback'); } const images = await convertPDFToImagesEnhanced(arrayBuffer); let ocrResults = []; for (const image of images) { // Try multiple OCR approaches const results = await Promise.allSettled([ // Tesseract with Turkish and English extractTextWithTesseract(image, 'tur+eng'), // Tesseract with additional preprocessing extractTextWithTesseract(image, 'tur+eng', { preprocess: true }), // Fallback to English only if Turkish fails extractTextWithTesseract(image, 'eng') ]); // Find the best result let bestResult = ''; let maxLength = 0; results.forEach(result => { if (result.status === 'fulfilled' && result.value.length > maxLength) { bestResult = result.value; maxLength = result.value.length; } }); if (bestResult) { ocrResults.push(bestResult); } } return ocrResults.join('\n\n') || 'OCR processing completed but no text was extracted.'; } // Enhanced Tesseract extraction async function extractTextWithTesseract(image, languages = 'tur+eng', options = {}) { try { const config = { logger: m => console.log(`Tesseract: ${m.status} - ${Math.round(m.progress * 100)}%`), preserve_interword_spaces: '1', tessedit_pageseg_mode: '6', tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" ğüşıöçĞÜŞİÖÇ@#$%^&*+=<>:;_ ', load_system_dawg: '1', load_freq_dawg: '1' }; if (options.preprocess) { // Apply image preprocessing image = await preprocessImage(image); } const result = await Tesseract.recognize(image, languages, config); return result.data.text; } catch (error) { console.error('Tesseract OCR error:', error); throw error; } } // Image preprocessing for better OCR async function preprocessImage(canvas) { const ctx = canvas.getContext('2d'); const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height); const data = imageData.data; // Convert to grayscale for (let i = 0; i < data.length; i += 4) { const gray = data[i] * 0.299 + data[i + 1] * 0.587 + data[i + 2] * 0.114; data[i] = gray; data[i + 1] = gray; data[i + 2] = gray; } // Apply adaptive thresholding const threshold = 128; for (let i = 0; i < data.length; i += 4) { const value = data[i] > threshold ? 255 : 0; data[i] = value; data[i + 1] = value; data[i + 2] = value; } ctx.putImageData(imageData, 0, 0); return canvas; } // Enhanced PDF to image conversion async function convertPDFToImagesEnhanced(pdfData) { // Ensure we have a fresh copy of the data let data; if (pdfData instanceof ArrayBuffer) { data = new Uint8Array(pdfData.slice(0)); } else if (pdfData instanceof Uint8Array) { data = new Uint8Array(pdfData.buffer.slice(0)); } else { throw new Error('Invalid PDF data format for image conversion'); } const loadingTask = pdfjsLib.getDocument({ data: data.buffer, cMapUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/cmaps/', cMapPacked: true, standardFontDataUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/standard_fonts/', // Disable worker for image conversion to avoid detachment issues useWorkerFetch: false, isEvalSupported: false, disableWorker: true }); const pdf = await loadingTask.promise; const images = []; // Process all pages with higher resolution for (let i = 1; i <= Math.min(pdf.numPages, 10); i++) { const page = await pdf.getPage(i); const viewport = page.getViewport({ scale: 3.0 }); const canvas = document.createElement('canvas'); const context = canvas.getContext('2d'); canvas.height = viewport.height; canvas.width = viewport.width; // Render with better quality await page.render({ canvasContext: context, viewport: viewport, renderInteractiveForms: true, intent: 'print' }).promise; images.push(canvas); } return images; } // Text quality improvement function improveTextQuality(text) { return text // Fix common OCR errors in Turkish .replace(/\bi\b/g, 'ı') // Turkish dotless i .replace(/\bI\b/g, 'İ') // Turkish capital I with dot .replace(/c([aeiou])/gi, 'ç$1') // c followed by vowel -> ç .replace(/C([AEIOU])/g, 'Ç$1') .replace(/g([aeiou])/gi, 'ğ$1') // g followed by vowel -> ğ .replace(/G([AEIOU])/g, 'Ğ$1') .replace(/s([aeiou])/gi, 'ş$1') // s followed by vowel -> ş .replace(/S([AEIOU])/g, 'Ş$1') .replace(/o([aeiou])/gi, 'ö$1') // o followed by vowel -> ö .replace(/O([AEIOU])/g, 'Ö$1') .replace(/u([aeiou])/gi, 'ü$1') // u followed by vowel -> ü .replace(/U([AEIOU])/g, 'Ü$1') // Clean up spacing .replace(/\s+/g, ' ') .replace(/\n\s*\n/g, '\n\n') // Fix common character confusion .replace(/0/g, 'O', { condition: context => /[A-Z]/.test(context.after) }) .replace(/1/g, 'I', { condition: context => /[A-Z]/.test(context.after) }) .replace(/5/g, 'S', { condition: context => /[A-Z]/.test(context.after) }) .trim(); } async function extractTextFromWord(file) { return new Promise(async (resolve, reject) => { const reader = new FileReader(); reader.onload = async function(event) { try { // Enhanced Word document extraction const result = await mammoth.extractRawText({ arrayBuffer: event.target.result, options: { includeDefaultStyleMap: true, styleMap: [ "p[style-name='Heading 1'] => h1:fresh", "p[style-name='Heading 2'] => h2:fresh", "p[style-name='Heading 3'] => h3:fresh", "p[style-name='Title'] => h1.title:fresh", "r[style-name='Strong'] => strong", "r[style-name='Emphasis'] => em" ] } }); let text = result.value; // Apply Turkish character decoding text = decodeTurkishText(text); // Apply text quality improvements text = improveTextQuality(text); // Try alternative extraction if result is poor if (text.trim().length < 50) { console.warn('Primary Word extraction failed, trying alternative...'); const altResult = await mammoth.convertToMarkdown({ arrayBuffer: event.target.result }); if (altResult.value && altResult.value.trim().length > text.trim().length) { text = altResult.value; text = decodeTurkishText(text); text = improveTextQuality(text); } } resolve(text); } catch (error) { reject(error); } }; reader.onerror = reject; reader.readAsArrayBuffer(file); }); } async function extractTextFromExcel(file) { return new Promise(async (resolve, reject) => { const reader = new FileReader(); reader.onload = async function(event) { try { const data = new Uint8Array(event.target.result); // Enhanced Excel reading with Turkish support const workbook = XLSX.read(data, { type: 'array', codepage: 1254, // Turkish codepage cellStyles: true, cellHTML: false }); const result = {}; workbook.SheetNames.forEach(sheetName => { const worksheet = workbook.Sheets[sheetName]; // Try multiple extraction methods const jsonData = XLSX.utils.sheet_to_json(worksheet, { header: 1, raw: false, dateNF: 'dd/mm/yyyy', defval: '' }); const csvData = XLSX.utils.sheet_to_csv(worksheet, { FS: '\t', RS: '\n', dateNF: 'dd/mm/yyyy' }); // Process data with Turkish character support const processedData = jsonData.map(row => row.map(cell => { if (typeof cell === 'string') { return decodeTurkishText(improveTextQuality(cell)); } return cell; }) ); result[sheetName] = { data: processedData, csv: decodeTurkishText(csvData), range: worksheet['!ref'] || '', rowCount: jsonData.length, colCount: jsonData[0] ? jsonData[0].length : 0 }; }); resolve(result); } catch (error) { reject(error); } }; reader.onerror = reject; reader.readAsArrayBuffer(file); }); } async function convertPDFToImages(pdfData) { // Create a fresh copy before processing if (pdfData instanceof ArrayBuffer) { return await convertPDFToImagesEnhanced(pdfData.slice(0)); } else if (pdfData instanceof Uint8Array) { return await convertPDFToImagesEnhanced(pdfData.buffer.slice(0)); } return await convertPDFToImagesEnhanced(pdfData); } async function extractTextFromImage(file) { return new Promise(async (resolve, reject) => { try { const imageElement = file instanceof HTMLCanvasElement ? file : file; // Apply advanced preprocessing const processedImages = await applyAdvancedPreprocessing(imageElement); // Multi-strategy OCR approach const ocrResults = []; for (const processedImage of processedImages) { const results = await Promise.allSettled([ // Strategy 1: Turkish with best settings performAdvancedOCR(processedImage, 'tur', { tessedit_pageseg_mode: '6', preserve_interword_spaces: '1', tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" @#$%^&*+=<>:;_ ğüşıöçĞÜŞİÖÇ', tessedit_ocr_engine_mode: '1', tessedit_do_ocr: '1', tessedit_load_image: '1' }), // Strategy 2: Turkish+English with auto segmentation performAdvancedOCR(processedImage, 'tur+eng', { tessedit_pageseg_mode: '1', preserve_interword_spaces: '1', tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" @#$%^&*+=<>:;_ ğüşıöçĞÜŞİÖÇ', tessedit_ocr_engine_mode: '1' }), // Strategy 3: Single column mode performAdvancedOCR(processedImage, 'tur', { tessedit_pageseg_mode: '3', preserve_interword_spaces: '1', tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" @#$%^&*+=<>:;_ ğüşıöçĞÜŞİÖÇ' }) ]); results.forEach(result => { if (result.status === 'fulfilled' && result.value.text.trim().length > 10) { ocrResults.push({ text: result.value.text, confidence: result.value.confidence || calculateConfidence(result.value.text), strategy: result.value.strategy }); } }); } // Select best result using advanced scoring const bestResult = selectBestResult(ocrResults); if (bestResult) { // Apply document structure analysis const structuredText = await analyzeDocumentStructure(bestResult.text); // Apply intelligent Turkish text corrections const correctedText = applyIntelligentTurkishCorrections(structuredText); resolve(correctedText); } else { resolve('No text could be extracted from the image.'); } } catch (error) { console.error('Enhanced image OCR error:', error); reject(error); } }); // Apply advanced image preprocessing techniques async function applyAdvancedPreprocessing(imageElement) { const processedImages = []; // Original image if (imageElement instanceof HTMLCanvasElement) { processedImages.push(imageElement); } else { const canvas = await imageToCanvas(imageElement); processedImages.push(canvas); } // Enhanced preprocessing variations const variations = [ // High contrast await applyImageEnhancement(processedImages[0], 'contrast'), // Denoised await applyImageEnhancement(processedImages[0], 'denoise'), // Sharpened await applyImageEnhancement(processedImages[0], 'sharpen'), // Binarized await applyImageEnhancement(processedImages[0], 'binarize') ]; processedImages.push(...variations.filter(img => img !== null)); return processedImages; } // Convert image to canvas async function imageToCanvas(image) { return new Promise((resolve) => { const img = new Image(); img.onload = () => { const canvas = document.createElement('canvas'); canvas.width = img.width; canvas.height = img.height; const ctx = canvas.getContext('2d'); ctx.drawImage(img, 0, 0); resolve(canvas); }; img.src = image instanceof HTMLCanvasElement ? image.toDataURL() : URL.createObjectURL(image); }); } // Apply specific image enhancement async function applyImageEnhancement(canvas, type) { const ctx = canvas.getContext('2d'); const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height); const data = imageData.data; switch(type) { case 'contrast': // Enhance contrast const contrast = 1.5; for (let i = 0; i < data.length; i += 4) { data[i] = ((data[i] - 128) * contrast) + 128; data[i + 1] = ((data[i + 1] - 128) * contrast) + 128; data[i + 2] = ((data[i + 2] - 128) * contrast) + 128; } break; case 'denoise': // Simple noise reduction for (let i = 0; i < data.length; i += 4) { const avg = (data[i] + data[i + 1] + data[i + 2]) / 3; const threshold = 30; if (Math.abs(data[i] - avg) > threshold) data[i] = avg; if (Math.abs(data[i + 1] - avg) > threshold) data[i + 1] = avg; if (Math.abs(data[i + 2] - avg) > threshold) data[i + 2] = avg; } break; case 'sharpen': // Sharpen filter const weights = [0, -1, 0, -1, 5, -1, 0, -1, 0]; const side = Math.round(Math.sqrt(weights.length)); const halfSide = Math.floor(side / 2); const output = ctx.createImageData(canvas.width, canvas.height); const dst = output.data; for (let y = 0; y < canvas.height; y++) { for (let x = 0; x < canvas.width; x++) { const dstOff = (y * canvas.width + x) * 4; let r = 0, g = 0, b = 0; for (let cy = 0; cy < side; cy++) { for (let cx = 0; cx < side; cx++) { const scy = y + cy - halfSide; const scx = x + cx - halfSide; if (scy >= 0 && scy < canvas.height && scx >= 0 && scx < canvas.width) { const srcOff = (scy * canvas.width + scx) * 4; const wt = weights[cy * side + cx]; r += data[srcOff] * wt; g += data[srcOff + 1] * wt; b += data[srcOff + 2] * wt; } } } dst[dstOff] = r; dst[dstOff + 1] = g; dst[dstOff + 2] = b; dst[dstOff + 3] = 255; } } ctx.putImageData(output, 0, 0); return canvas; case 'binarize': // Adaptive thresholding for (let i = 0; i < data.length; i += 4) { const gray = data[i] * 0.299 + data[i + 1] * 0.587 + data[i + 2] * 0.114; const threshold = gray > 128 ? 255 : 0; data[i] = threshold; data[i + 1] = threshold; data[i + 2] = threshold; } break; } ctx.putImageData(imageData, 0, 0); return canvas; } // Advanced OCR processing async function performAdvancedOCR(image, languages, config) { try { const result = await Tesseract.recognize(image, languages, { logger: m => console.log(`OCR (${languages}): ${m.status} - ${Math.round(m.progress * 100)}%`), ...config }); return { text: result.data.text, confidence: result.data.confidence || 0, strategy: `OCR_${languages}_${config.tessedit_pageseg_mode}` }; } catch (error) { console.error(`OCR strategy failed:`, error); return { text: '', confidence: 0 }; } } // Select best OCR result using advanced scoring function selectBestResult(results) { if (results.length === 0) return null; let bestScore = -1; let bestResult = null; results.forEach(result => { const score = calculateAdvancedScore(result.text, result.confidence); if (score > bestScore) { bestScore = score; bestResult = result; } }); return bestResult; } // Calculate advanced scoring for OCR results function calculateAdvancedScore(text, baseConfidence) { if (!text || text.trim().length === 0) return 0; let score = baseConfidence || 0; // Turkish character detection (40% weight) const turkishChars = (text.match(/[ğüşıöçĞÜŞİÖÇ]/g) || []).length; const totalChars = text.replace(/\s/g, '').length; const turkishRatio = totalChars > 0 ? turkishChars / totalChars : 0; score += turkishRatio * 40; // Word detection (20% weight) const words = text.match(/\b\w+\b/g) || []; const turkishWords = words.filter(word => /[ğıüşiöçĞIÜŞİÖÇ]/.test(word)); const wordRatio = words.length > 0 ? turkishWords.length / words.length : 0; score += wordRatio * 20; // Sentence structure (20% weight) const sentences = text.split(/[.!?]+/).filter(s => s.trim().length > 5); const avgSentenceLength = sentences.length > 0 ? sentences.reduce((sum, s) => sum + s.split(/\s+/).length, 0) / sentences.length : 0; const sentenceScore = Math.min(avgSentenceLength / 10, 1); score += sentenceScore * 20; // Text length penalty for very short texts if (text.trim().length < 20) score *= 0.5; return Math.min(score, 100); } // Analyze document structure like Abbyy FineReader async function analyzeDocumentStructure(text) { // Split text into potential sections const lines = text.split('\n').filter(line => line.trim().length > 0); const structuredSections = []; let currentSection = { type: 'paragraph', content: [], level: 0 }; for (let i = 0; i < lines.length; i++) { const line = lines[i].trim(); // Detect headings if (isHeading(line)) { if (currentSection.content.length > 0) { structuredSections.push(currentSection); } currentSection = { type: 'heading', content: [line], level: detectHeadingLevel(line) }; } // Detect lists else if (isListItem(line)) { if (currentSection.type !== 'list') { if (currentSection.content.length > 0) { structuredSections.push(currentSection); } currentSection = { type: 'list', content: [], level: 0 }; } currentSection.content.push(line); } // Detect tables else if (isTableRow(line)) { if (currentSection.type !== 'table') { if (currentSection.content.length > 0) { structuredSections.push(currentSection); } currentSection = { type: 'table', content: [], level: 0 }; } currentSection.content.push(line); } // Regular paragraph else { if (currentSection.type !== 'paragraph') { if (currentSection.content.length > 0) { structuredSections.push(currentSection); } currentSection = { type: 'paragraph', content: [], level: 0 }; } currentSection.content.push(line); } } if (currentSection.content.length > 0) { structuredSections.push(currentSection); } return formatStructuredText(structuredSections); } // Check if line is a heading function isHeading(line) { // Short lines with all caps or title case are likely headings if (line.length < 50 && line.split(/\s+/).length <= 8) { const words = line.split(/\s+/); const titleWords = words.filter(word => /^[A-ZÇĞİÖŞÜ]/.test(word)); return titleWords.length / words.length > 0.6; } // Lines with colon at end are often headings if (line.endsWith(':')) return true; // Numbered headings if (/^\d+\.?\s+[A-ZÇĞİÖŞÜ]/.test(line)) return true; return false; } // Detect heading level function detectHeadingLevel(line) { if (/^\d+\.\d+\s+/.test(line)) return 2; if (/^\d+\s+/.test(line)) return 1; if (line.length < 30) return 1; if (line.length < 40) return 2; return 3; } // Check if line is a list item function isListItem(line) { return /^[-*•]\s+/.test(line) || /^\d+\.\s+/.test(line) || /^\([a-z]\)\s+/.test(line); } // Check if line is a table row function isTableRow(line) { // Multiple tabs or multiple pipes suggest table return (line.split('\t').length > 2) || (line.split('|').length > 3); } // Format structured text based on output format function formatStructuredText(sections) { if (outputFormat.value === 'markdown') { return formatAsMarkdown(sections); } else if (outputFormat.value === 'json') { return formatAsJSON(sections); } else if (outputFormat.value === 'formatted') { return formatAsStructuredText(sections); } return sections.map(s => s.content.join(' ')).join('\n\n'); } // Format as Markdown function formatAsMarkdown(sections) { let markdown = ''; sections.forEach(section => { switch(section.type) { case 'heading': const hashes = '#'.repeat(section.level); markdown += `${hashes} ${section.content[0]}\n\n`; break; case 'list': section.content.forEach(item => { markdown += `- ${item}\n`; }); markdown += '\n'; break; case 'table': section.content.forEach(row => { markdown += `| ${row.split(/\t+|\|/).join(' | ')} |\n`; }); markdown += '\n'; break; case 'paragraph': markdown += section.content.join(' ') + '\n\n'; break; } }); return markdown.trim(); } // Format as JSON function formatAsJSON(sections) { const structured = sections.map(section => ({ type: section.type, level: section.level, content: section.content })); return JSON.stringify(structured, null, 2); } // Format as structured text function formatAsStructuredText(sections) { let text = ''; sections.forEach(section => { switch(section.type) { case 'heading': text += '\n' + section.content[0].toUpperCase() + '\n'; text += '='.repeat(section.content[0].length) + '\n\n'; break; case 'list': section.content.forEach(item => { text += ' • ' + item + '\n'; }); text += '\n'; break; case 'table': section.content.forEach(row => { text += row + '\n'; }); text += '\n'; break; case 'paragraph': text += section.content.join(' ') + '\n\n'; break; } }); return text.trim(); } // Apply intelligent Turkish corrections function applyIntelligentTurkishCorrections(text) { // Turkish character corrections based on context const corrections = [ // Common OCR mistakes { pattern: /\bc\b/g, replacement: 'ç' }, { pattern: /\bC\b/g, replacement: 'Ç' }, { pattern: /\bg\b/g, replacement: 'ğ', context: /[aeiou]/i }, { pattern: /\bG\b/g, replacement: 'Ğ', context: /[AEIOU]/i }, { pattern: /\bi\b/g, replacement: 'ı', context: /[^iİ]/g }, { pattern: /\bI\b/g, replacement: 'İ' }, { pattern: /\bo\b/g, replacement: 'ö', context: /[aeiou]/i }, { pattern: /\bO\b/g, replacement: 'Ö', context: /[AEIOU]/i }, { pattern: /\bs\b/g, replacement: 'ş', context: /[aeiou]/i }, { pattern: /\bS\b/g, replacement: 'Ş', context: /[AEIOU]/i }, { pattern: /\bu\b/g, replacement: 'ü', context: /[aeiou]/i }, { pattern: /\bU\b/g, replacement: 'Ü', context: /[AEIOU]/i }, // Number and symbol corrections { pattern: /0/g, replacement: 'O', context: /[A-Z]/ }, { pattern: /1/g, replacement: 'İ', context: /[A-Z]/ }, { pattern: /5/g, replacement: 'S', context: /[A-Z]/ }, // Common word corrections { pattern: /\bve\b/gi, replacement: 've' }, { pattern: /\bile\b/gi, replacement: 'ile' }, { pattern: /\bicin\b/gi, replacement: 'için' }, { pattern: /\bsizin\b/gi, replacement: 'sizin' }, { pattern: /\bbir\b/gi, replacement: 'bir' }, { pattern: /\bbu\b/gi, replacement: 'bu' }, { pattern: /\bsu\b/gi, replacement: 'şu' } ]; let correctedText = text; corrections.forEach(correction => { correctedText = correctedText.replace(correction.pattern, correction.replacement); }); // Fix spacing around punctuation correctedText = correctedText .replace(/\s+([.,!?;:])/g, '$1') .replace(/([.,!?;:])\s*/g, '$1 ') .replace(/\s+/g, ' ') .trim(); return correctedText; } function processFormattedOCR(hocr) { // Apply learned corrections if (window.ocrLearningDict) { for (const [word, data] of Object.entries(window.ocrLearningDict)) { if (data.confirmedCorrect && data.confirmedCorrect !== word) { hocr = hocr.replace(new RegExp(word, 'g'), data.confirmedCorrect); } } } // Parse hOCR output to preserve formatting and layout const parser = new DOMParser(); const doc = parser.parseFromString(hocr, 'text/html'); const paragraphs = doc.querySelectorAll('.ocr_par'); let formattedText = ''; paragraphs.forEach(par => { const lines = par.querySelectorAll('.ocr_line'); lines.forEach(line => { const words = line.querySelectorAll('.ocrx_word'); let lineText = ''; words.forEach((word, index) => { const wordText = word.textContent || ''; const wordConfidence = parseFloat(word.getAttribute('title') .match(/x_wconf (\d+)/)[1]); // Better handling of Turkish characters and confidence if (wordConfidence < 50) { lineText += `[${wordText}] `; } else if (wordConfidence < 70) { lineText += `${wordText} `; } else if (wordConfidence < 85 && /[ğüşıöçĞÜŞİÖÇ]/.test(wordText)) { lineText += `${wordText}`; } else { lineText += `${wordText} `; } }); // Better line spacing for Turkish text formattedText += lineText.trim() + '\n\n'; // Store problematic words for learning words.forEach(word => { const wordConfidence = parseFloat(word.getAttribute('title') .match(/x_wconf (\d+)/)[1]); if (wordConfidence < 85) { const originalWord = word.textContent || ''; if (!window.ocrLearningDict) window.ocrLearningDict = {}; if (!window.ocrLearningDict[originalWord]) { window.ocrLearningDict[originalWord] = { occurrences: 0, confirmedCorrect: null, suggestTime: null }; } window.ocrLearningDict[originalWord].occurrences++; } }); }); formattedText += '\n'; }); return formattedText; } } function displayResult(result) { // Check if this was an OCR fallback result const isOCRResult = result.content.includes('OCR processing attempted') || result.content.includes('Warning: No extractable text found'); // Scan for potential errors and ask user confirmation if (window.ocrLearningDict) { for (const [word, data] of Object.entries(window.ocrLearningDict)) { if (data.confirmedCorrect === null && result.content.includes(word)) { data.suggestTime = new Date().toISOString(); if (confirm(`Is "${word}" correctly recognized? If not, please type the correct version.`)) { data.confirmedCorrect = word; } else { const corrected = prompt(`Please enter correct version for "${word}":`, word); if (corrected) { data.confirmedCorrect = corrected; // Replace in current result result.content = result.content.replace(new RegExp(word, 'g'), corrected); } } } } } const resultCard = document.createElement('div'); resultCard.className = 'bg-gray-50 rounded-lg p-4 shadow-sm'; const header = document.createElement('div'); header.className = 'flex justify-between items-center mb-3'; const title = document.createElement('h3'); title.className = 'font-semibold text-lg text-gray-800 truncate'; title.textContent = result.fileName; const downloadBtn = document.createElement('button'); downloadBtn.className = 'bg-indigo-600 hover:bg-indigo-700 text-white font-medium py-1 px-3 rounded transition duration-200 flex items-center text-sm'; downloadBtn.innerHTML = ' Download'; downloadBtn.addEventListener('click', () => downloadResult(result)); header.appendChild(title); header.appendChild(downloadBtn); const content = document.createElement('div'); if (isOCRResult) { const warning = document.createElement('div'); warning.className = 'pdf-ocr-warning'; warning.innerHTML = `
Note: This PDF was processed using OCR as no selectable text was found. Results may contain errors or inaccuracies.
`; content.appendChild(warning); } // Create pre element with proper Turkish character support const pre = document.createElement('pre'); pre.className = result.format === 'formatted' ? 'ocr-result turkish-text' : ''; pre.style.cssText = 'font-family: monospace; white-space: pre-wrap; word-wrap: break-word; line-height: 1.6;'; // Handle content display with proper encoding if (result.format === 'json') { try { const parsed = JSON.parse(result.content); pre.textContent = JSON.stringify(parsed, null, 2); } catch (e) { pre.textContent = result.content; } } else { pre.textContent = result.content; } content.appendChild(pre); resultCard.appendChild(header); resultCard.appendChild(content); resultsContainer.appendChild(resultCard); feather.replace(); } function downloadResult(result) { // Set proper MIME type and encoding for Turkish characters let mimeType = 'text/plain;charset=utf-8'; let content = result.content; if (result.format === 'json') { mimeType = 'application/json;charset=utf-8'; } else if (result.format === 'markdown') { mimeType = 'text/markdown;charset=utf-8'; } // Add UTF-8 BOM for better Turkish character support in some applications const bom = new Uint8Array([0xEF, 0xBB, 0xBF]); const encoder = new TextEncoder(); const contentBytes = encoder.encode(content); const combinedBytes = new Uint8Array(bom.length + contentBytes.length); combinedBytes.set(bom); combinedBytes.set(contentBytes, bom.length); const blob = new Blob([combinedBytes], { type: mimeType }); const url = URL.createObjectURL(blob); const a = document.createElement('a'); a.href = url; a.download = `${result.fileName.split('.')[0]}.${result.format}`; document.body.appendChild(a); a.click(); document.body.removeChild(a); URL.revokeObjectURL(url); } downloadAllBtn.addEventListener('click', () => { processedResults.forEach(result => { downloadResult(result); }); }); });