Spaces:
Running
Running
| document.addEventListener('DOMContentLoaded', function() { | |
// --- DOM references ---------------------------------------------------------
// Short lookup helper so every element reference reads as one compact line.
const byId = (id) => document.getElementById(id);

const uploadBtn = byId('uploadBtn');
const fileInput = byId('fileInput');
const filePreviewList = byId('filePreviewList');
const filePreviewContainer = byId('filePreviewContainer');
const processBtn = byId('processBtn');
const outputFormat = byId('outputFormat');
const resultsContainer = byId('resultsContainer');
const resultsSection = byId('resultsSection');
const downloadAllBtn = byId('downloadAllBtn');

// --- Mutable state shared by the handlers in this scope ---------------------
// `files` is replaced on every file-input change; `processedResults` is reset
// at the start of every processing run.
let files = [];
let processedResults = [];

// PDF.js needs the URL of its worker script before any document is opened.
pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.11.338/pdf.worker.min.js';

// --- File selection wiring --------------------------------------------------
// The visible button forwards its click to the file input; the input's change
// event then drives the preview list.
uploadBtn.addEventListener('click', () => {
  fileInput.click();
});
fileInput.addEventListener('change', handleFileSelection);
/**
 * Handles the file input's `change` event.
 *
 * Stores the newly selected files in the shared `files` array, rebuilds the
 * preview list with one card per file, and shows the preview container (or
 * hides it when the selection is empty).
 *
 * @param {Event} e - Change event whose `target.files` holds the selection.
 */
function handleFileSelection(e) {
  files = Array.from(e.target.files);
  filePreviewList.innerHTML = '';

  if (files.length === 0) {
    filePreviewContainer.classList.add('hidden');
    return;
  }

  files.forEach((file, index) => {
    filePreviewList.appendChild(createFilePreview(file, index));
  });
  filePreviewContainer.classList.remove('hidden');

  // feather.replace() swaps the [data-feather] placeholders found in the
  // document, so it has to run after the cards are attached. Without this
  // call the icon of the last (or only) card stays an empty <i> element.
  feather.replace();
}
/**
 * Builds the preview card for one selected file: an icon and the file name on
 * the left, the human-readable size on the right.
 *
 * @param {File} file - File to describe; `name`, `size` and (through
 *   getFileIcon) `type` are read.
 * @param {number} index - Position in the selection (currently unused).
 * @returns {HTMLDivElement} The detached card element.
 */
function createFilePreview(file, index) {
  // Element factory: create a tag and, when given, assign its class list.
  const make = (tag, classes) => {
    const node = document.createElement(tag);
    if (classes !== undefined) {
      node.className = classes;
    }
    return node;
  };

  // Icon placeholder wrapped in a round badge.
  const glyph = make('i');
  glyph.dataset.feather = getFileIcon(file);
  const badge = make('div', 'bg-gray-200 p-2 rounded-full mr-3');
  badge.appendChild(glyph);

  // File name label.
  const label = make('span', 'font-medium text-gray-800');
  label.textContent = file.name;

  // Left-hand group: badge followed by the name.
  const leftGroup = make('div', 'flex items-center');
  leftGroup.appendChild(badge);
  leftGroup.appendChild(label);

  // Right-hand size label.
  const sizeLabel = make('span', 'text-gray-500 text-sm');
  sizeLabel.textContent = formatFileSize(file.size);

  const row = make('div', 'file-card bg-gray-50 rounded-lg p-4 flex items-center justify-between');
  row.appendChild(leftGroup);
  row.appendChild(sizeLabel);

  // NOTE(review): the card is still detached here, so this call presumably
  // only affects placeholders that are already in the document — confirm.
  feather.replace();
  return row;
}
/**
 * Picks the Feather icon name for a file from its MIME type.
 *
 * Rules are checked in order and the first match wins; anything that matches
 * no rule gets the generic 'file' icon.
 *
 * @param {File} file - File whose `type` string is inspected.
 * @returns {string} Feather icon name: 'file', 'file-text' or 'image'.
 */
function getFileIcon(file) {
  const mime = file.type;

  // [substrings to look for, icon to use] — order matters.
  const rules = [
    [['pdf'], 'file'],
    [['word', 'document', 'excel', 'spreadsheet'], 'file-text'],
    [['image'], 'image'],
  ];

  const hit = rules.find(([needles]) => needles.some((needle) => mime.includes(needle)));
  return hit ? hit[1] : 'file';
}
/**
 * Formats a byte count as a short human-readable string using 1024-based
 * units, with at most two decimals and trailing zeros dropped
 * (e.g. 1536 -> "1.5 KB").
 *
 * The unit is clamped to the largest one listed, so very large values print
 * as a big TB figure instead of indexing past the end of the unit list.
 *
 * @param {number} bytes - Size in bytes.
 * @returns {string} Value and unit separated by a space.
 */
function formatFileSize(bytes) {
  if (bytes === 0) return '0 Bytes';

  const step = 1024;
  const units = ['Bytes', 'KB', 'MB', 'GB', 'TB'];

  // Repeated division by 1024 is exact in floating point, unlike a ratio of
  // logarithms, which can land just below an integer and pick the wrong unit.
  let value = bytes;
  let unitIndex = 0;
  while (value >= step && unitIndex < units.length - 1) {
    value /= step;
    unitIndex += 1;
  }

  // parseFloat drops the trailing zeros that toFixed(2) adds.
  return parseFloat(value.toFixed(2)) + ' ' + units[unitIndex];
}
// Process files
//
// Click handler for the "Process Files" button. Each selected file is
// processed and rendered in turn. A failure in one file is recorded and the
// remaining files are still processed, so one unsupported or corrupt file no
// longer discards the whole batch or leaves rendered results hidden.
processBtn.addEventListener('click', async function() {
  if (files.length === 0) {
    alert('Please select at least one file');
    return;
  }

  // Work on a snapshot: the selection can be replaced through the file input
  // while this handler is awaiting an extraction.
  const queue = [...files];
  const failures = [];

  resultsContainer.innerHTML = '';
  processedResults = [];
  processBtn.disabled = true;
  processBtn.innerHTML = '<i data-feather="loader" class="spinner mr-2"></i> Processing...';
  feather.replace();

  try {
    for (const file of queue) {
      try {
        const result = await processFile(file);
        processedResults.push(result);
        displayResult(result);
      } catch (error) {
        console.error('Error processing files:', error);
        // Not every rejection value is an Error (it may be a DOM event), so
        // fall back to its string form when there is no message.
        const reason = error && error.message ? error.message : String(error);
        failures.push(`${file.name}: ${reason}`);
      }
    }

    // Reveal the section whenever something was rendered, even if other
    // files failed.
    if (processedResults.length > 0) {
      resultsSection.classList.remove('hidden');
    }

    if (failures.length > 0) {
      alert('An error occurred while processing files:\n' + failures.join('\n'));
    }
  } finally {
    // Always restore the button, whatever happened above.
    processBtn.disabled = false;
    processBtn.innerHTML = '<i data-feather="cpu" class="mr-2"></i> Process Files';
    feather.replace();
  }
});
/**
 * Classifies a file as 'pdf', 'excel', 'word' or 'image' from its MIME type
 * and file extension, or returns null when it matches none of them.
 *
 * Spreadsheets are tested before Word documents on purpose: the .xlsx MIME
 * type ("...officedocument.spreadsheetml.sheet") contains the substring
 * "document", so a word-first check would send every XLSX file to the Word
 * extractor.
 *
 * @param {{name?: string, type?: string}} file - File-like object.
 * @returns {('pdf'|'excel'|'word'|'image'|null)} Detected kind.
 */
function detectFileKind(file) {
  // Lower-case both inputs so "REPORT.DOCX" is recognised like "report.docx".
  const type = (file.type || '').toLowerCase();
  const name = (file.name || '').toLowerCase();
  const hasExtension = (...extensions) => extensions.some((ext) => name.endsWith(ext));

  if (type.includes('pdf') || hasExtension('.pdf')) return 'pdf';
  if (type.includes('excel') || type.includes('spreadsheet') || hasExtension('.xlsx', '.xls')) {
    return 'excel';
  }
  if (type.includes('word') || type.includes('document') || hasExtension('.docx', '.doc')) {
    return 'word';
  }
  if (type.includes('image')) return 'image';
  return null;
}

/**
 * Renders extracted content in the requested output format.
 *
 * @param {{name: string, type: string, size: number}} file - Source file.
 * @param {(string|Object)} content - Extracted text, or structured data.
 * @param {string} format - 'json', 'markdown', or anything else for plain text.
 * @returns {string} The formatted content.
 */
function formatExtractedContent(file, content, format) {
  if (format === 'json') {
    return JSON.stringify({
      fileName: file.name,
      fileType: file.type,
      fileSize: file.size,
      content: content,
      extractedAt: new Date().toISOString()
    }, null, 2);
  }

  if (format === 'markdown') {
    const heading = `# ${file.name}\n\n`;
    if (typeof content === 'string') {
      return heading + content;
    }
    // Structured data: pretty-print it, double the line breaks, and bold the
    // property names.
    return heading + JSON.stringify(content, null, 2)
      .replace(/\n/g, '\n\n')
      .replace(/"([^"]+)":/g, '**$1**:');
  }

  // Plain text
  return typeof content === 'string' ? content : JSON.stringify(content, null, 2);
}

/**
 * Extracts the content of one file with the extractor that matches its type
 * and formats it according to the current value of the output-format control.
 *
 * @param {File} file - File to process.
 * @returns {Promise<{fileName: string, content: string, format: string}>}
 *   The file name, the formatted content and the format that was applied.
 * @throws {Error} When the file type is not supported, or when the chosen
 *   extractor rejects.
 */
async function processFile(file) {
  const format = outputFormat.value;

  let content;
  switch (detectFileKind(file)) {
    case 'pdf':
      content = await extractTextFromPDF(file);
      break;
    case 'excel':
      content = await extractTextFromExcel(file);
      break;
    case 'word':
      content = await extractTextFromWord(file);
      break;
    case 'image':
      content = await extractTextFromImage(file);
      break;
    default:
      // Browsers report an empty type for unknown extensions; show the file
      // name in that case so the message is still useful.
      throw new Error('Unsupported file type: ' + (file.type || file.name));
  }

  return {
    fileName: file.name,
    content: formatExtractedContent(file, content, format),
    format: format
  };
}
/**
 * Extracts the text of every page of a PDF file with PDF.js.
 *
 * The text items of each page are joined with single spaces and every page is
 * followed by a blank line.
 *
 * @param {File} file - PDF file to read.
 * @returns {Promise<string>} Concatenated page text.
 * @throws {Error} When the file cannot be read or PDF.js cannot parse it.
 */
async function extractTextFromPDF(file) {
  // Adapt the callback-based FileReader to a promise.
  const buffer = await new Promise((resolve, reject) => {
    const reader = new FileReader();
    reader.onload = () => resolve(reader.result);
    // Reject with the reader's own error: the event passed to onerror has no
    // `message`, which made callers report "undefined".
    reader.onerror = () => reject(reader.error || new Error('Failed to read file: ' + file.name));
    reader.readAsArrayBuffer(file);
  });

  const pdf = await pdfjsLib.getDocument(new Uint8Array(buffer)).promise;
  try {
    let text = '';
    for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
      const page = await pdf.getPage(pageNumber);
      const content = await page.getTextContent();
      text += content.items.map((item) => item.str).join(' ') + '\n\n';
    }
    return text;
  } finally {
    // Release the document once the text has been collected, also when a
    // page fails to load.
    if (typeof pdf.destroy === 'function') {
      await pdf.destroy();
    }
  }
}
/**
 * Extracts the raw text of a Word document with mammoth.
 *
 * @param {File} file - Word file to read.
 * @returns {Promise<string>} The `value` of mammoth's raw-text result.
 * @throws {Error} When the file cannot be read or mammoth rejects.
 */
async function extractTextFromWord(file) {
  // Adapt the callback-based FileReader to a promise.
  const arrayBuffer = await new Promise((resolve, reject) => {
    const reader = new FileReader();
    reader.onload = () => resolve(reader.result);
    // Reject with the reader's own error: the event passed to onerror has no
    // `message`, which made callers report "undefined".
    reader.onerror = () => reject(reader.error || new Error('Failed to read file: ' + file.name));
    reader.readAsArrayBuffer(file);
  });

  const result = await mammoth.extractRawText({ arrayBuffer: arrayBuffer });
  return result.value;
}
/**
 * Reads a spreadsheet file with SheetJS and returns the rows of every sheet.
 *
 * @param {File} file - Spreadsheet file to read.
 * @returns {Promise<Object<string, Array>>} Object keyed by sheet name; each
 *   value is what `sheet_to_json` returns with `header: 1` (rows as arrays).
 * @throws {Error} When the file cannot be read or the workbook cannot be parsed.
 */
async function extractTextFromExcel(file) {
  // Adapt the callback-based FileReader to a promise.
  const buffer = await new Promise((resolve, reject) => {
    const reader = new FileReader();
    reader.onload = () => resolve(reader.result);
    // Reject with the reader's own error: the event passed to onerror has no
    // `message`, which made callers report "undefined".
    reader.onerror = () => reject(reader.error || new Error('Failed to read file: ' + file.name));
    reader.readAsArrayBuffer(file);
  });

  const workbook = XLSX.read(new Uint8Array(buffer), { type: 'array' });

  const sheets = {};
  for (const sheetName of workbook.SheetNames) {
    sheets[sheetName] = XLSX.utils.sheet_to_json(workbook.Sheets[sheetName], { header: 1 });
  }
  return sheets;
}
/**
 * Runs OCR on an image with Tesseract (Turkish + English).
 *
 * When the output-format control is set to 'formatted', the hOCR output is
 * turned into line- and paragraph-structured text with low-confidence words
 * marked in brackets. Otherwise, or when no usable hOCR is available, the
 * plain recognised text is returned.
 *
 * @param {File} file - Image to recognise.
 * @returns {Promise<string>} Recognised text.
 * @throws {Error} When Tesseract rejects.
 */
async function extractTextFromImage(file) {
  // Words recognised with a confidence below this value are wrapped in [ ].
  const LOW_CONFIDENCE = 60;

  // The recognition languages include Turkish, so the whitelist has to
  // include the Turkish letters as well; an ASCII-only list would exclude
  // them from the output.
  const asciiCharacters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" ';
  const turkishLetters = 'ÇĞİÖŞÜçğıöşü';

  /**
   * Converts hOCR markup to text: one output line per OCR line and a blank
   * line after each paragraph.
   */
  function processFormattedOCR(hocr) {
    const doc = new DOMParser().parseFromString(hocr, 'text/html');
    let formattedText = '';

    doc.querySelectorAll('.ocr_par').forEach((paragraph) => {
      paragraph.querySelectorAll('.ocr_line').forEach((line) => {
        const words = [];
        line.querySelectorAll('.ocrx_word').forEach((word) => {
          const wordText = word.textContent || '';

          // The confidence is stored in the title attribute as "x_wconf NN".
          // The attribute or the token can be missing; such words are kept
          // unmarked instead of throwing.
          const title = word.getAttribute('title') || '';
          const match = title.match(/x_wconf\s+(\d+(?:\.\d+)?)/);
          const confidence = match ? parseFloat(match[1]) : null;

          // Every word is separated by a single space. Medium-confidence
          // words used to be appended without a separator, which glued them
          // to the following word.
          if (confidence !== null && confidence < LOW_CONFIDENCE) {
            words.push(`[${wordText}]`);
          } else {
            words.push(wordText);
          }
        });
        formattedText += words.join(' ').trim() + '\n';
      });
      formattedText += '\n';
    });

    return formattedText;
  }

  const { data: { text, hocr } } = await Tesseract.recognize(
    file,
    'tur+eng', // Turkish + English languages
    {
      logger: (m) => console.log(m),
      preserve_interword_spaces: true,
      tessedit_pageseg_mode: 6, // Assume a single uniform block of text
      tessedit_char_whitelist: asciiCharacters + turkishLetters,
      tessedit_create_hocr: 1 // Include formatting info
    }
  );

  if (outputFormat.value !== 'formatted') {
    return text;
  }

  // Fall back to the plain text when hOCR is absent or contains no
  // paragraphs, rather than returning an empty result.
  const formatted = hocr ? processFormattedOCR(hocr) : '';
  return formatted.trim() === '' ? text : formatted;
}
/**
 * Renders one processed result as a card in the results container: a header
 * with the file name and a download button, followed by the content in a
 * <pre> element.
 *
 * @param {{fileName: string, content: string, format: string}} result -
 *   Processed result to show.
 */
function displayResult(result) {
  // Element factory: create a tag and, when given, assign its class list.
  const build = (tag, classes) => {
    const node = document.createElement(tag);
    if (classes !== undefined) {
      node.className = classes;
    }
    return node;
  };

  // Header: file name on the left, download button on the right.
  const heading = build('h3', 'font-semibold text-lg text-gray-800 truncate');
  heading.textContent = result.fileName;

  const button = build('button', 'bg-indigo-600 hover:bg-indigo-700 text-white font-medium py-1 px-3 rounded transition duration-200 flex items-center text-sm');
  button.innerHTML = '<i data-feather="download" class="mr-1"></i> Download';
  button.addEventListener('click', () => downloadResult(result));

  const topRow = build('div', 'flex justify-between items-center mb-3');
  topRow.appendChild(heading);
  topRow.appendChild(button);

  // Body: every format is shown the same way, as preformatted text assigned
  // through textContent so the content is never interpreted as HTML.
  const preformatted = build('pre');
  preformatted.textContent = result.content;
  const body = build('div');
  body.appendChild(preformatted);

  const card = build('div', 'bg-gray-50 rounded-lg p-4 shadow-sm');
  card.appendChild(topRow);
  card.appendChild(body);
  resultsContainer.appendChild(card);

  // The card is attached now, so the download icon placeholder can be swapped.
  feather.replace();
}
/**
 * Triggers a browser download of one processed result.
 *
 * The download name is the original file name with only its last extension
 * replaced, so "report.v2.final.pdf" becomes "report.v2.final.json" rather
 * than "report.json", and a dotfile such as ".env" keeps its name.
 *
 * @param {{fileName: string, content: string, format: string}} result -
 *   Processed result to download.
 */
function downloadResult(result) {
  // Formats that are plain text get a conventional ".txt" extension; any
  // other format name is used as the extension unchanged.
  // NOTE(review): 'formatted' is an output-format value used by the OCR path;
  // 'text' is presumed to be the plain-text option value — confirm.
  const extensionByFormat = new Map([
    ['formatted', 'txt'],
    ['text', 'txt'],
  ]);
  const extension = extensionByFormat.get(result.format) || result.format;

  // Strip the last extension only. A dot at index 0 marks a dotfile, not an
  // extension, so such names are kept whole.
  const lastDot = result.fileName.lastIndexOf('.');
  const baseName = lastDot > 0 ? result.fileName.slice(0, lastDot) : result.fileName;

  const blob = new Blob([result.content], { type: 'text/plain' });
  const url = URL.createObjectURL(blob);

  // A temporary anchor is the portable way to start a named download.
  const a = document.createElement('a');
  a.href = url;
  a.download = `${baseName}.${extension}`;
  document.body.appendChild(a);
  a.click();
  document.body.removeChild(a);
  URL.revokeObjectURL(url);
}
// "Download all": start one download per processed result, in the order the
// results were produced.
downloadAllBtn.addEventListener('click', () => {
  for (const result of processedResults) {
    downloadResult(result);
  }
});
| }); |