/**
 * Page controller for the in-browser document extractor.
 *
 * Wires the upload / process / download controls, extracts text from the
 * selected PDF, Word, Excel and image files using the globally loaded
 * pdfjsLib, mammoth, XLSX and Tesseract libraries, renders one result card
 * per file and offers the results as downloads.
 */
document.addEventListener('DOMContentLoaded', function() {
    const uploadBtn = document.getElementById('uploadBtn');
    const fileInput = document.getElementById('fileInput');
    const filePreviewList = document.getElementById('filePreviewList');
    const filePreviewContainer = document.getElementById('filePreviewContainer');
    const processBtn = document.getElementById('processBtn');
    const outputFormat = document.getElementById('outputFormat');
    const resultsContainer = document.getElementById('resultsContainer');
    const resultsSection = document.getElementById('resultsSection');
    const downloadAllBtn = document.getElementById('downloadAllBtn');

    // Files currently selected in the file input.
    let files = [];
    // Results of the most recent processing run, in processing order.
    let processedResults = [];

    // Feather icon name for each file kind returned by detectFileKind().
    const ICON_BY_KIND = {
        pdf: 'file',
        word: 'file-text',
        excel: 'file-text',
        image: 'image'
    };

    // Set PDF.js worker path
    pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.11.338/pdf.worker.min.js';

    // Handle file selection
    uploadBtn.addEventListener('click', () => fileInput.click());
    fileInput.addEventListener('change', handleFileSelection);

    /**
     * Rebuilds the preview list from the file input's current selection.
     *
     * @param {Event} e - `change` event of the file input.
     */
    function handleFileSelection(e) {
        files = Array.from(e.target.files);
        filePreviewList.innerHTML = '';

        if (files.length === 0) {
            filePreviewContainer.classList.add('hidden');
            return;
        }

        files.forEach((file, index) => {
            filePreviewList.appendChild(createFilePreview(file, index));
        });
        filePreviewContainer.classList.remove('hidden');

        // Icons are swapped in only after the cards are attached, so that
        // every card (including the last one) is part of the document when
        // feather scans for placeholders.
        feather.replace();
    }

    /**
     * Builds the preview card (icon, name, size) for one selected file.
     * The caller is responsible for calling feather.replace() once the card
     * has been inserted into the document.
     *
     * @param {File} file - The selected file.
     * @param {number} index - Position in the selection (currently unused).
     * @returns {HTMLDivElement} The detached preview card.
     */
    function createFilePreview(file, index) {
        const card = document.createElement('div');
        card.className = 'file-card bg-gray-50 rounded-lg p-4 flex items-center justify-between';

        const fileInfo = document.createElement('div');
        fileInfo.className = 'flex items-center';

        const icon = document.createElement('div');
        icon.className = 'bg-gray-200 p-2 rounded-full mr-3';

        const fileIcon = document.createElement('i');
        fileIcon.dataset.feather = getFileIcon(file);
        icon.appendChild(fileIcon);

        const fileName = document.createElement('span');
        fileName.className = 'font-medium text-gray-800';
        fileName.textContent = file.name;

        fileInfo.appendChild(icon);
        fileInfo.appendChild(fileName);

        const fileSize = document.createElement('span');
        fileSize.className = 'text-gray-500 text-sm';
        fileSize.textContent = formatFileSize(file.size);

        card.appendChild(fileInfo);
        card.appendChild(fileSize);
        return card;
    }

    /**
     * Classifies a file by MIME type, falling back to its extension.
     *
     * Spreadsheets are tested before Word documents on purpose: the .xlsx
     * MIME type contains "officedocument", which would otherwise satisfy the
     * generic "document" test and route spreadsheets to the Word extractor.
     *
     * @param {File} file - File to classify.
     * @returns {('pdf'|'excel'|'word'|'image'|null)} Kind, or null if unsupported.
     */
    function detectFileKind(file) {
        const type = file.type;
        const name = file.name.toLowerCase();

        if (type.includes('pdf')) {
            return 'pdf';
        }
        if (type.includes('excel') || type.includes('spreadsheet') ||
            name.endsWith('.xlsx') || name.endsWith('.xls')) {
            return 'excel';
        }
        if (type.includes('word') || type.includes('document') ||
            name.endsWith('.docx') || name.endsWith('.doc')) {
            return 'word';
        }
        if (type.includes('image')) {
            return 'image';
        }
        return null;
    }

    /**
     * Picks the feather icon name used in the preview card of a file.
     *
     * @param {File} file - The selected file.
     * @returns {string} Feather icon name; 'file' for unrecognised kinds.
     */
    function getFileIcon(file) {
        return ICON_BY_KIND[detectFileKind(file)] || 'file';
    }

    /**
     * Formats a byte count as a human-readable size with up to two decimals.
     *
     * @param {number} bytes - Size in bytes.
     * @returns {string} For example '0 Bytes', '1.5 KB', '2 MB'.
     */
    function formatFileSize(bytes) {
        if (bytes === 0) {
            return '0 Bytes';
        }
        const k = 1024;
        const sizes = ['Bytes', 'KB', 'MB', 'GB', 'TB'];
        // Clamp so sizes beyond the largest unit never index past the array.
        const i = Math.min(
            Math.floor(Math.log(bytes) / Math.log(k)),
            sizes.length - 1
        );
        return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
    }

    // Process files
    processBtn.addEventListener('click', async function() {
        if (files.length === 0) {
            alert('Please select at least one file');
            return;
        }

        resultsContainer.innerHTML = '';
        processedResults = [];
        processBtn.disabled = true;
        processBtn.innerHTML = ' Processing...';
        feather.replace();

        try {
            // Files are processed one at a time; the first failure stops the run.
            for (const file of files) {
                const result = await processFile(file);
                processedResults.push(result);
                displayResult(result);
            }
        } catch (error) {
            console.error('Error processing files:', error);
            const reason = (error && error.message) || String(error);
            alert('An error occurred while processing files: ' + reason);
        } finally {
            // Reveal whatever was rendered, even if a later file failed.
            if (processedResults.length > 0) {
                resultsSection.classList.remove('hidden');
            }
            processBtn.disabled = false;
            processBtn.innerHTML = ' Process Files';
            feather.replace();
        }
    });

    /**
     * Extracts the content of one file and renders it in the selected format.
     *
     * @param {File} file - File to process.
     * @returns {Promise<{fileName: string, content: string, format: string}>}
     * @throws {Error} When the file kind is not supported, or extraction fails.
     */
    async function processFile(file) {
        const format = outputFormat.value;
        let content;

        switch (detectFileKind(file)) {
            case 'pdf':
                content = await extractTextFromPDF(file);
                break;
            case 'excel':
                content = await extractTextFromExcel(file);
                break;
            case 'word':
                content = await extractTextFromWord(file);
                break;
            case 'image':
                content = await extractTextFromImage(file);
                break;
            default:
                throw new Error('Unsupported file type: ' + file.type);
        }

        return {
            fileName: file.name,
            content: formatContent(file, content, format),
            format: format
        };
    }

    /**
     * Converts extracted content to the requested output format.
     *
     * @param {File} file - Source file (name, type and size go into the output).
     * @param {(string|Object)} content - Extracted text, or the per-sheet
     *     object produced by the Excel extractor.
     * @param {string} format - 'json', 'markdown', or anything else for plain text.
     * @returns {string} The formatted content.
     */
    function formatContent(file, content, format) {
        if (format === 'json') {
            return JSON.stringify({
                fileName: file.name,
                fileType: file.type,
                fileSize: file.size,
                content: content,
                extractedAt: new Date().toISOString()
            }, null, 2);
        }

        if (format === 'markdown') {
            const heading = `# ${file.name}\n\n`;
            if (typeof content === 'string') {
                return heading + content;
            }
            // Structured content: pretty-print as JSON, double the line
            // breaks and render the keys in bold.
            return heading + JSON.stringify(content, null, 2)
                .replace(/\n/g, '\n\n')
                .replace(/"([^"]+)":/g, '**$1**:');
        }

        // Plain text
        return typeof content === 'string'
            ? content
            : JSON.stringify(content, null, 2);
    }

    /**
     * Reads a file into memory.
     *
     * @param {File} file - File to read.
     * @returns {Promise<ArrayBuffer>} The file's bytes. Rejects with the
     *     reader's error (not the raw event) so callers get a usable message.
     */
    function readFileAsArrayBuffer(file) {
        return new Promise((resolve, reject) => {
            const reader = new FileReader();
            reader.onload = () => resolve(reader.result);
            reader.onerror = () => reject(
                reader.error || new Error('Failed to read file: ' + file.name)
            );
            reader.readAsArrayBuffer(file);
        });
    }

    /**
     * Extracts the text of every page of a PDF using pdf.js.
     *
     * @param {File} file - PDF file.
     * @returns {Promise<string>} Page texts, each followed by a blank line.
     */
    async function extractTextFromPDF(file) {
        const buffer = await readFileAsArrayBuffer(file);
        const pdf = await pdfjsLib.getDocument(new Uint8Array(buffer)).promise;

        let text = '';
        for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
            const page = await pdf.getPage(pageNumber);
            const content = await page.getTextContent();
            const strings = content.items.map(item => item.str);
            text += strings.join(' ') + '\n\n';
        }
        return text;
    }

    /**
     * Extracts the raw text of a Word document using mammoth.
     *
     * @param {File} file - Word document.
     * @returns {Promise<string>} The `value` of mammoth's raw-text result.
     */
    async function extractTextFromWord(file) {
        const arrayBuffer = await readFileAsArrayBuffer(file);
        const result = await mammoth.extractRawText({ arrayBuffer: arrayBuffer });
        return result.value;
    }

    /**
     * Reads every sheet of a workbook using SheetJS.
     *
     * @param {File} file - Spreadsheet file.
     * @returns {Promise<Object>} Object keyed by sheet name; each value is the
     *     result of `sheet_to_json` called with `{ header: 1 }`.
     */
    async function extractTextFromExcel(file) {
        const buffer = await readFileAsArrayBuffer(file);
        const workbook = XLSX.read(new Uint8Array(buffer), { type: 'array' });

        const sheets = {};
        workbook.SheetNames.forEach(sheetName => {
            sheets[sheetName] = XLSX.utils.sheet_to_json(
                workbook.Sheets[sheetName],
                { header: 1 }
            );
        });
        return sheets;
    }

    /**
     * Runs OCR on an image with Tesseract.
     *
     * When the output format is 'formatted' and hOCR output is available, the
     * hOCR is converted by processFormattedOCR(); otherwise the plain
     * recognised text is returned.
     *
     * @param {File} file - Image file.
     * @returns {Promise<string>} Recognised text.
     */
    async function extractTextFromImage(file) {
        // NOTE(review): whether the tessedit_* entries are honoured when passed
        // in this options object depends on the Tesseract.js version - confirm.
        const { data: { text, hocr } } = await Tesseract.recognize(
            file,
            'tur+eng', // Turkish + English languages
            {
                logger: m => console.log(m),
                preserve_interword_spaces: true,
                tessedit_pageseg_mode: 6, // Assume a single uniform block of text
                // Includes the Turkish letters so they are not filtered out
                // while recognising with the 'tur' language.
                tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÇĞİÖŞÜçğıöşü0123456789.,!?-(){}[]/\\\'" ',
                tessedit_create_hocr: 1 // Include formatting info
            }
        );

        if (outputFormat.value === 'formatted' && hocr) {
            return processFormattedOCR(hocr);
        }
        return text;
    }

    /**
     * Rebuilds paragraph and line structure from hOCR markup.
     *
     * Each `.ocr_line` becomes one output line and each `.ocr_par` is followed
     * by a blank line. Words with confidence below 60 are wrapped in square
     * brackets.
     *
     * @param {string} hocr - hOCR markup.
     * @returns {string} Text with one line per OCR line.
     */
    function processFormattedOCR(hocr) {
        const doc = new DOMParser().parseFromString(hocr, 'text/html');
        let formattedText = '';

        doc.querySelectorAll('.ocr_par').forEach(par => {
            par.querySelectorAll('.ocr_line').forEach(line => {
                let lineText = '';

                line.querySelectorAll('.ocrx_word').forEach((word, index) => {
                    const wordText = word.textContent || '';
                    const match = (word.getAttribute('title') || '')
                        .match(/x_wconf (\d+)/);
                    // A word without a confidence annotation is treated as
                    // fully confident instead of aborting the conversion.
                    const wordConfidence = match ? parseFloat(match[1]) : 100;

                    if (wordConfidence < 60) {
                        lineText += `[${wordText}] `;
                    } else if (wordConfidence < 80 && index > 0) {
                        // NOTE(review): no trailing space here, so this word is
                        // joined to the following one - confirm this is intended.
                        lineText += `${wordText}`;
                    } else {
                        lineText += `${wordText} `;
                    }
                });

                formattedText += lineText.trim() + '\n';
            });
            formattedText += '\n';
        });

        return formattedText;
    }

    /**
     * Appends a result card (title, download button, content) to the results
     * container.
     *
     * @param {{fileName: string, content: string, format: string}} result
     */
    function displayResult(result) {
        const resultCard = document.createElement('div');
        resultCard.className = 'bg-gray-50 rounded-lg p-4 shadow-sm';

        const header = document.createElement('div');
        header.className = 'flex justify-between items-center mb-3';

        const title = document.createElement('h3');
        title.className = 'font-semibold text-lg text-gray-800 truncate';
        title.textContent = result.fileName;

        const downloadBtn = document.createElement('button');
        downloadBtn.className = 'bg-indigo-600 hover:bg-indigo-700 text-white font-medium py-1 px-3 rounded transition duration-200 flex items-center text-sm';
        downloadBtn.innerHTML = ' Download';
        downloadBtn.addEventListener('click', () => downloadResult(result));

        header.appendChild(title);
        header.appendChild(downloadBtn);

        // Every format is shown as preformatted text; textContent keeps
        // extracted content from being interpreted as markup.
        const content = document.createElement('div');
        const pre = document.createElement('pre');
        pre.textContent = result.content;
        content.appendChild(pre);

        resultCard.appendChild(header);
        resultCard.appendChild(content);
        resultsContainer.appendChild(resultCard);
        feather.replace();
    }

    /**
     * Triggers a browser download of one result.
     *
     * The download is named after the source file with its last extension
     * replaced by the output format (e.g. 'my.report.pdf' -> 'my.report.json').
     *
     * @param {{fileName: string, content: string, format: string}} result
     */
    function downloadResult(result) {
        const blob = new Blob([result.content], { type: 'text/plain' });
        const url = URL.createObjectURL(blob);

        // Strip only the final extension; a leading dot is part of the name.
        const lastDot = result.fileName.lastIndexOf('.');
        const baseName = lastDot > 0
            ? result.fileName.slice(0, lastDot)
            : result.fileName;

        const a = document.createElement('a');
        a.href = url;
        a.download = `${baseName}.${result.format}`;
        document.body.appendChild(a);
        a.click();
        document.body.removeChild(a);
        URL.revokeObjectURL(url);
    }

    // Download every result of the last run, one file per result.
    downloadAllBtn.addEventListener('click', () => {
        processedResults.forEach(result => {
            downloadResult(result);
        });
    });
});