docextractor-pro / script.js
ASDAD34's picture
Yapamıyor Türkçe ocr yapacak aynı abbyfine redaer format çeviri gibi Adobe redaer DC ocr format çevirisi gibi çalışmalı
3a12466 verified
raw
history blame
14 kB
document.addEventListener('DOMContentLoaded', function() {
// Look up, by id, the DOM elements this script reads from and writes to.
const uploadBtn = document.getElementById('uploadBtn');
const fileInput = document.getElementById('fileInput');
const filePreviewList = document.getElementById('filePreviewList');
const filePreviewContainer = document.getElementById('filePreviewContainer');
const processBtn = document.getElementById('processBtn');
const outputFormat = document.getElementById('outputFormat');
const resultsContainer = document.getElementById('resultsContainer');
const resultsSection = document.getElementById('resultsSection');
const downloadAllBtn = document.getElementById('downloadAllBtn');
// Files from the most recent selection; reassigned by handleFileSelection.
let files = [];
// Results of the most recent processing run; reset and refilled by the
// process button handler and read by the "download all" handler.
let processedResults = [];
// Set PDF.js worker path
pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.11.338/pdf.worker.min.js';
// Handle file selection
// Clicking the upload button forwards the click to the file input.
uploadBtn.addEventListener('click', () => fileInput.click());
fileInput.addEventListener('change', handleFileSelection);
/**
 * Handles the file input's `change` event: stores the selected files in
 * `files` and rebuilds the preview list with one card per file.
 *
 * When the selection is empty the preview container is hidden instead.
 *
 * @param {Event} e - Change event whose `target.files` holds the selection.
 */
function handleFileSelection(e) {
  files = Array.from(e.target.files);
  filePreviewList.innerHTML = '';

  if (files.length === 0) {
    filePreviewContainer.classList.add('hidden');
    return;
  }

  files.forEach((file, index) => {
    filePreviewList.appendChild(createFilePreview(file, index));
  });

  // feather.replace() only converts placeholders that are attached to the
  // document, so it has to run after the cards have been appended; otherwise
  // the most recently created card keeps its empty <i data-feather> element.
  feather.replace();

  filePreviewContainer.classList.remove('hidden');
}
/**
 * Builds the preview card for one selected file: an icon placeholder and the
 * file name on the left, the human-readable size on the right.
 *
 * @param {File} file - File whose name, type and size are shown.
 * @param {number} index - Position of the file in the selection (unused here).
 * @returns {HTMLDivElement} The card element; the caller attaches it.
 */
function createFilePreview(file, index) {
  // Small factory so each element and its classes are declared in one line.
  const make = (tag, classes) => {
    const node = document.createElement(tag);
    if (classes !== undefined) {
      node.className = classes;
    }
    return node;
  };

  // Left-hand side: round badge holding the feather placeholder, then the name.
  const glyph = make('i');
  glyph.dataset.feather = getFileIcon(file);

  const badge = make('div', 'bg-gray-200 p-2 rounded-full mr-3');
  badge.appendChild(glyph);

  const label = make('span', 'font-medium text-gray-800');
  label.textContent = file.name;

  const info = make('div', 'flex items-center');
  info.appendChild(badge);
  info.appendChild(label);

  // Right-hand side: formatted size.
  const size = make('span', 'text-gray-500 text-sm');
  size.textContent = formatFileSize(file.size);

  const card = make('div', 'file-card bg-gray-50 rounded-lg p-4 flex items-center justify-between');
  card.appendChild(info);
  card.appendChild(size);

  feather.replace();
  return card;
}
/**
 * Picks the feather icon name for a file from substrings of its MIME type.
 *
 * Rules are checked in order and the first match wins; anything that matches
 * no rule gets the generic 'file' icon.
 *
 * @param {{type: string}} file - Object exposing the MIME type as `type`.
 * @returns {string} Feather icon name.
 */
function getFileIcon(file) {
  const mime = file.type;
  const rules = [
    [['pdf'], 'file'],
    [['word', 'document'], 'file-text'],
    [['excel', 'spreadsheet'], 'file-text'],
    [['image'], 'image'],
  ];

  for (const [needles, iconName] of rules) {
    if (needles.some((needle) => mime.includes(needle))) {
      return iconName;
    }
  }
  return 'file';
}
/**
 * Formats a byte count as a short human-readable string, e.g. "1.5 KB".
 *
 * The value is divided by 1024 until it drops below 1024 or the largest
 * known unit is reached, so sizes of 1 TiB and above are reported in GB
 * rather than indexing past the end of the unit list.
 *
 * @param {number} bytes - Size in bytes.
 * @returns {string} Value rounded to at most two decimals, plus its unit.
 */
function formatFileSize(bytes) {
  if (bytes === 0) return '0 Bytes';

  const k = 1024;
  const sizes = ['Bytes', 'KB', 'MB', 'GB'];

  // Repeated division by a power of two is exact, which avoids the rounding
  // error a log-based unit index can pick up near unit boundaries.
  let value = bytes;
  let unit = 0;
  while (value >= k && unit < sizes.length - 1) {
    value /= k;
    unit += 1;
  }

  // parseFloat drops trailing zeros left by toFixed ("1.50" -> 1.5).
  return `${parseFloat(value.toFixed(2))} ${sizes[unit]}`;
}
// Process files
// Runs every selected file through processFile, one after another, and
// renders each result as soon as it is available. While the run is in
// progress the button is disabled and shows a spinner; it is always restored
// afterwards, whether the run succeeded or failed.
processBtn.addEventListener('click', async function() {
  if (files.length === 0) {
    alert('Please select at least one file');
    return;
  }

  // Start from a clean slate: drop the output of any previous run.
  resultsContainer.innerHTML = '';
  processedResults = [];

  processBtn.disabled = true;
  processBtn.innerHTML = '<i data-feather="loader" class="spinner mr-2"></i> Processing...';
  feather.replace();

  try {
    for (const file of files) {
      const result = await processFile(file);
      processedResults.push(result);
      displayResult(result);
      // Reveal the section as soon as there is something to show, so results
      // extracted before a later file fails are not left hidden.
      resultsSection.classList.remove('hidden');
    }
  } catch (error) {
    // The first failure aborts the remaining files of this run.
    console.error('Error processing files:', error);
    alert('An error occurred while processing files: ' + error.message);
  } finally {
    processBtn.disabled = false;
    processBtn.innerHTML = '<i data-feather="cpu" class="mr-2"></i> Process Files';
    feather.replace();
  }
});
/**
 * Extracts the content of one file and renders it in the output format that
 * is currently selected in `outputFormat`.
 *
 * The extractor is chosen from the MIME type, falling back to the file
 * extension for Word and spreadsheet files. Spreadsheets are tested before
 * Word documents because the .xlsx MIME type
 * (`...officedocument.spreadsheetml.sheet`) also contains the substring
 * "document" and would otherwise be sent to the Word extractor.
 *
 * @param {File} file - File to process.
 * @returns {Promise<{fileName: string, content: string, format: string}>}
 *   The file name, the formatted content and the format it was rendered in.
 * @throws {Error} If no extractor matches the file.
 */
async function processFile(file) {
  const format = outputFormat.value;
  const mime = file.type;
  // Extensions are compared case-insensitively so "REPORT.XLSX" is recognised.
  const lowerName = file.name.toLowerCase();

  const isSpreadsheet = mime.includes('excel') || mime.includes('spreadsheet') ||
    lowerName.endsWith('.xlsx') || lowerName.endsWith('.xls');
  const isWord = mime.includes('word') || mime.includes('document') ||
    lowerName.endsWith('.docx') || lowerName.endsWith('.doc');

  let content;
  if (mime.includes('pdf')) {
    content = await extractTextFromPDF(file);
  } else if (isSpreadsheet) {
    content = await extractTextFromExcel(file);
  } else if (isWord) {
    content = await extractTextFromWord(file);
  } else if (mime.includes('image')) {
    content = await extractTextFromImage(file);
  } else {
    throw new Error('Unsupported file type: ' + file.type);
  }

  // Convert content to requested format
  let formattedContent;
  if (format === 'json') {
    formattedContent = JSON.stringify({
      fileName: file.name,
      fileType: file.type,
      fileSize: file.size,
      content: content,
      extractedAt: new Date().toISOString()
    }, null, 2);
  } else if (format === 'markdown') {
    formattedContent = `# ${file.name}\n\n`;
    if (typeof content === 'string') {
      formattedContent += content;
    } else {
      // Structured content (e.g. spreadsheet data) is shown as spaced-out
      // JSON with the object keys rendered in bold.
      formattedContent += JSON.stringify(content, null, 2)
        .replace(/\n/g, '\n\n')
        .replace(/"([^"]+)":/g, '**$1**:');
    }
  } else {
    // Plain text (also used for any format not handled above)
    formattedContent = typeof content === 'string'
      ? content
      : JSON.stringify(content, null, 2);
  }

  return {
    fileName: file.name,
    content: formattedContent,
    format: format
  };
}
/**
 * Reads a PDF file and returns the text of its text layer.
 *
 * Pages are processed in order; the strings of each page are joined with
 * single spaces and every page is followed by a blank line.
 *
 * @param {File} file - The PDF file to read.
 * @returns {Promise<string>} Concatenated text of all pages.
 *   Rejects with the read error if the file cannot be read, or with the
 *   PDF.js error if the document cannot be parsed.
 */
async function extractTextFromPDF(file) {
  return new Promise((resolve, reject) => {
    const reader = new FileReader();

    reader.onload = async function(event) {
      try {
        const typedArray = new Uint8Array(event.target.result);
        const pdf = await pdfjsLib.getDocument(typedArray).promise;

        let text = '';
        for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
          const page = await pdf.getPage(pageNumber);
          const content = await page.getTextContent();
          text += content.items.map(item => item.str).join(' ') + '\n\n';
        }
        resolve(text);
      } catch (error) {
        reject(error);
      }
    };

    // The `error` event is a ProgressEvent without a `message`; reject with
    // the reader's actual error so callers can report something meaningful.
    reader.onerror = () => reject(reader.error || new Error('Failed to read file: ' + file.name));

    reader.readAsArrayBuffer(file);
  });
}
/**
 * Reads a Word document and returns its raw text as produced by
 * `mammoth.extractRawText`.
 *
 * @param {File} file - The Word file to read.
 * @returns {Promise<string>} The `value` of mammoth's extraction result.
 *   Rejects with the read error if the file cannot be read, or with
 *   mammoth's error if extraction fails.
 */
async function extractTextFromWord(file) {
  return new Promise((resolve, reject) => {
    const reader = new FileReader();

    reader.onload = function(event) {
      mammoth.extractRawText({ arrayBuffer: event.target.result })
        .then(result => resolve(result.value))
        .catch(reject);
    };

    // The `error` event is a ProgressEvent without a `message`; reject with
    // the reader's actual error so callers can report something meaningful.
    reader.onerror = () => reject(reader.error || new Error('Failed to read file: ' + file.name));

    reader.readAsArrayBuffer(file);
  });
}
/**
 * Reads a spreadsheet file and returns the contents of every sheet.
 *
 * Each sheet is converted with `XLSX.utils.sheet_to_json` using
 * `{ header: 1 }` and stored under its sheet name.
 *
 * @param {File} file - The spreadsheet file to read.
 * @returns {Promise<Object>} Object mapping each sheet name to the converted
 *   sheet data. Rejects with the read error if the file cannot be read, or
 *   with the parser's error if the workbook cannot be parsed.
 */
async function extractTextFromExcel(file) {
  return new Promise((resolve, reject) => {
    const reader = new FileReader();

    reader.onload = function(event) {
      try {
        const data = new Uint8Array(event.target.result);
        const workbook = XLSX.read(data, { type: 'array' });

        const result = {};
        for (const sheetName of workbook.SheetNames) {
          result[sheetName] = XLSX.utils.sheet_to_json(workbook.Sheets[sheetName], { header: 1 });
        }
        resolve(result);
      } catch (error) {
        reject(error);
      }
    };

    // The `error` event is a ProgressEvent without a `message`; reject with
    // the reader's actual error so callers can report something meaningful.
    reader.onerror = () => reject(reader.error || new Error('Failed to read file: ' + file.name));

    reader.readAsArrayBuffer(file);
  });
}
/**
 * Runs OCR on an image with Tesseract, using the Turkish and English models.
 *
 * When the selected output format is 'formatted' and the engine returned
 * hOCR, the hOCR is converted to text that keeps the paragraph and line
 * structure and marks low-confidence words in square brackets. Otherwise the
 * recognised plain text is returned.
 *
 * No character whitelist is passed: an ASCII-only whitelist would prevent
 * Turkish letters such as ç, ğ, ı, İ, ö, ş and ü from being recognised.
 *
 * NOTE(review): depending on the Tesseract.js version, engine parameters such
 * as `tessedit_pageseg_mode` may have to be set through a worker's
 * `setParameters` rather than this options object — confirm against the
 * version loaded by the page.
 *
 * @param {File} file - The image to recognise.
 * @returns {Promise<string>} Recognised text. Rejects if recognition fails.
 */
async function extractTextFromImage(file) {
  // Words recognised with a confidence below this value are bracketed.
  const LOW_CONFIDENCE = 60;

  /**
   * Converts hOCR markup to plain text: one output line per `.ocr_line`,
   * and a blank line after each `.ocr_par`.
   */
  function processFormattedOCR(hocr) {
    const doc = new DOMParser().parseFromString(hocr, 'text/html');
    let formattedText = '';

    for (const par of doc.querySelectorAll('.ocr_par')) {
      for (const line of par.querySelectorAll('.ocr_line')) {
        const pieces = [];
        for (const word of line.querySelectorAll('.ocrx_word')) {
          const wordText = word.textContent || '';
          // The confidence lives in the title attribute as "x_wconf NN".
          // A word without it is kept as-is instead of aborting the run.
          const title = word.getAttribute('title') || '';
          const match = title.match(/x_wconf (\d+)/);
          const isLowConfidence = match !== null && parseFloat(match[1]) < LOW_CONFIDENCE;
          pieces.push(isLowConfidence ? `[${wordText}]` : wordText);
        }
        // Every word is separated by a space so neighbouring words never
        // run together, whatever their confidence.
        formattedText += pieces.join(' ').trim() + '\n';
      }
      formattedText += '\n';
    }
    return formattedText;
  }

  const { data: { text, hocr } } = await Tesseract.recognize(
    file,
    'tur+eng', // Turkish + English languages
    {
      logger: m => console.log(m),
      preserve_interword_spaces: true,
      tessedit_pageseg_mode: 6, // Assume a single uniform block of text
      tessedit_create_hocr: 1 // Include formatting info
    }
  );

  // Fall back to the plain text when no hOCR was produced, rather than
  // returning an empty document.
  if (outputFormat.value === 'formatted' && hocr) {
    return processFormattedOCR(hocr);
  }
  return text;
}
/**
 * Renders one processed result as a card in the results container: a header
 * with the file name and a download button, followed by the content in a
 * <pre> element. The content is shown the same way for every format.
 *
 * @param {{fileName: string, content: string, format: string}} result -
 *   Result to show; it is also what the download button hands to
 *   downloadResult.
 */
function displayResult(result) {
  // Header: file name on the left, download button on the right.
  const heading = document.createElement('h3');
  heading.className = 'font-semibold text-lg text-gray-800 truncate';
  heading.textContent = result.fileName;

  const button = document.createElement('button');
  button.className = 'bg-indigo-600 hover:bg-indigo-700 text-white font-medium py-1 px-3 rounded transition duration-200 flex items-center text-sm';
  button.innerHTML = '<i data-feather="download" class="mr-1"></i> Download';
  button.addEventListener('click', () => downloadResult(result));

  const topRow = document.createElement('div');
  topRow.className = 'flex justify-between items-center mb-3';
  topRow.appendChild(heading);
  topRow.appendChild(button);

  // Body: textContent keeps the extracted text from being parsed as HTML.
  const block = document.createElement('pre');
  block.textContent = result.content;
  const body = document.createElement('div');
  body.appendChild(block);

  const card = document.createElement('div');
  card.className = 'bg-gray-50 rounded-lg p-4 shadow-sm';
  card.appendChild(topRow);
  card.appendChild(body);

  resultsContainer.appendChild(card);
  feather.replace();
}
/**
 * Triggers a browser download of one result as a text file.
 *
 * The download is named after the source file with its last extension
 * replaced by the result's format, so "report.v2.final.pdf" rendered as
 * JSON is saved as "report.v2.final.json". Names without an extension, or
 * whose only dot is the leading one, keep their full name.
 *
 * @param {{fileName: string, content: string, format: string}} result -
 *   Result whose content is downloaded.
 */
function downloadResult(result) {
  const blob = new Blob([result.content], { type: 'text/plain' });
  const url = URL.createObjectURL(blob);

  // Strip only the final extension; splitting on the first dot would cut
  // names such as "report.v2.final.pdf" down to "report".
  const lastDot = result.fileName.lastIndexOf('.');
  const baseName = lastDot > 0 ? result.fileName.slice(0, lastDot) : result.fileName;

  // A temporary anchor is attached, clicked and removed to start the download.
  const a = document.createElement('a');
  a.href = url;
  a.download = `${baseName}.${result.format}`;
  document.body.appendChild(a);
  a.click();
  document.body.removeChild(a);

  URL.revokeObjectURL(url);
}
// "Download all": start one download per result of the latest run, in order.
downloadAllBtn.addEventListener('click', () => {
  for (const entry of processedResults) {
    downloadResult(entry);
  }
});
});