Spaces:

ASDAD34
/

docextractor-pro

Running

App Files Files Community

docextractor-pro / script.js

ASDAD34's picture

içerik daha iyi türkçeye ocr çok bozuk iyi yapamıyor. abbyfinereader tarzında markdown, json,text formatına belge yapısına uygun çeviri yapsın. bu talimatı uygulama

602f295 verified about 2 months ago

history blame contribute delete

58.7 kB

	document.addEventListener('DOMContentLoaded', function() {
	// Look up the DOM nodes the uploader works with once, up front.
	const byId = (id) => document.getElementById(id);
	const uploadBtn = byId('uploadBtn');
	const fileInput = byId('fileInput');
	const filePreviewList = byId('filePreviewList');
	const filePreviewContainer = byId('filePreviewContainer');
	const processBtn = byId('processBtn');
	const outputFormat = byId('outputFormat');
	const resultsContainer = byId('resultsContainer');
	const resultsSection = byId('resultsSection');
	const downloadAllBtn = byId('downloadAllBtn');

	// Mutable state shared by the handlers in this closure.
	let files = [];
	let processedResults = [];

	// Point PDF.js at its worker script.
	const workerOptions = pdfjsLib.GlobalWorkerOptions;
	workerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.worker.min.js';
	// NOTE(review): isEvalSupported is also passed per document to getDocument() elsewhere in
	// this file; confirm whether setting it on GlobalWorkerOptions has any effect.
	workerOptions.isEvalSupported = false;

	// The visible button proxies clicks to the file input.
	uploadBtn.addEventListener('click', function openFilePicker() {
	  fileInput.click();
	});

	fileInput.addEventListener('change', handleFileSelection);

	/**
	 * Handles the file input's change event: stores the selected files and rebuilds
	 * the preview list from scratch.
	 *
	 * @param {Event} e - change event; `e.target.files` is read as the new selection.
	 */
	function handleFileSelection(e) {
	  files = Array.from(e.target.files ?? []);
	  filePreviewList.innerHTML = '';

	  if (files.length === 0) {
	    filePreviewContainer.classList.add('hidden');
	    return;
	  }

	  files.forEach((file, index) => {
	    filePreviewList.appendChild(createFilePreview(file, index));
	  });

	  // createFilePreview() calls feather.replace() while its card is still detached, so the
	  // most recently built card keeps its placeholder. Replace once more now that every
	  // card is attached to the list.
	  feather.replace();

	  filePreviewContainer.classList.remove('hidden');
	}

	/**
	 * Builds the preview card for one selected file: an icon and the file name on the
	 * left, the human-readable size on the right.
	 *
	 * @param {File} file - the file to describe (`name`, `size` and `type` are read).
	 * @param {number} index - position in the selection; not used by the card itself.
	 * @returns {HTMLDivElement} the detached card element.
	 */
	function createFilePreview(file, index) {
	  // Small factory: create an element and optionally set its class list.
	  const build = (tag, classes) => {
	    const node = document.createElement(tag);
	    if (classes !== undefined) {
	      node.className = classes;
	    }
	    return node;
	  };

	  const glyph = build('i');
	  glyph.dataset.feather = getFileIcon(file);

	  const badge = build('div', 'bg-gray-200 p-2 rounded-full mr-3');
	  badge.appendChild(glyph);

	  const label = build('span', 'font-medium text-gray-800');
	  label.textContent = file.name;

	  const info = build('div', 'flex items-center');
	  info.appendChild(badge);
	  info.appendChild(label);

	  const sizeLabel = build('span', 'text-gray-500 text-sm');
	  sizeLabel.textContent = formatFileSize(file.size);

	  const card = build('div', 'file-card bg-gray-50 rounded-lg p-4 flex items-center justify-between');
	  card.appendChild(info);
	  card.appendChild(sizeLabel);

	  feather.replace();
	  return card;
	}

	/**
	 * Picks the Feather icon name for a file based on its MIME type.
	 *
	 * @param {{type?: string}} file - file-like object; a missing type is treated as unknown.
	 * @returns {string} 'file-text' for Word/Excel documents, 'image' for images, 'file' otherwise.
	 */
	function getFileIcon(file) {
	  const type = file.type || '';
	  if (type.includes('pdf')) return 'file';
	  if (type.includes('word') || type.includes('document')) return 'file-text';
	  if (type.includes('excel') || type.includes('spreadsheet')) return 'file-text';
	  if (type.includes('image')) return 'image';
	  return 'file';
	}

	/**
	 * Formats a byte count as a short human-readable string using 1024-based units.
	 *
	 * @param {number} bytes - size in bytes.
	 * @returns {string} e.g. '1.5 KB'; '0 Bytes' for zero, negative or non-finite input.
	 */
	function formatFileSize(bytes) {
	  if (!Number.isFinite(bytes) || bytes <= 0) return '0 Bytes';
	  const k = 1024;
	  const sizes = ['Bytes', 'KB', 'MB', 'GB', 'TB'];
	  // Clamp the unit index: values below 1 would give a negative index and values beyond
	  // the last unit would index past the table (printing "undefined").
	  const rawIndex = Math.floor(Math.log(bytes) / Math.log(k));
	  const i = Math.min(Math.max(rawIndex, 0), sizes.length - 1);
	  return `${parseFloat((bytes / Math.pow(k, i)).toFixed(2))} ${sizes[i]}`;
	}

	// Process files
	processBtn.addEventListener('click', async function() {
	  if (files.length === 0) {
	    alert('Please select at least one file');
	    return;
	  }

	  // Registers extra Turkish language data once per page, if the loaded Tesseract build
	  // exposes such a hook. Feature-detected and wrapped because the stock Tesseract.js API
	  // does not document `addLanguageData`; an unguarded call would throw on every click.
	  function loadTurkishLanguageData() {
	    if (window.tesseractTurDataLoaded) return;
	    if (typeof Tesseract === 'undefined' || typeof Tesseract.addLanguageData !== 'function') return;
	    try {
	      Tesseract.addLanguageData('tur', {
	        data: '/static/tesseract/tur.traineddata.gz'
	      });
	      window.tesseractTurDataLoaded = true;
	    } catch (error) {
	      console.warn('Could not register Turkish language data:', error);
	    }
	  }

	  resultsContainer.innerHTML = '';
	  processedResults = [];
	  processBtn.disabled = true;
	  processBtn.innerHTML = '<i data-feather="loader" class="spinner mr-2"></i> Processing...';
	  feather.replace();

	  try {
	    // Language data has to be in place before any OCR runs, not after the batch.
	    loadTurkishLanguageData();

	    // Files are processed one at a time, in selection order.
	    for (const file of [...files]) {
	      const result = await processFile(file);
	      processedResults.push(result);
	      displayResult(result);
	    }
	  } catch (error) {
	    console.error('Error processing files:', error);
	    alert('An error occurred while processing files: ' + error.message);
	  } finally {
	    // Results rendered before a failure stay reachable instead of sitting in a hidden section.
	    if (processedResults.length > 0) {
	      resultsSection.classList.remove('hidden');
	    }
	    processBtn.disabled = false;
	    processBtn.innerHTML = '<i data-feather="cpu" class="mr-2"></i> Process Files';
	    feather.replace();
	  }
	}
	);

	/**
	 * Extracts the content of one file and renders it in the format selected in the
	 * `outputFormat` control ('json', 'markdown', 'formatted', anything else = plain text).
	 *
	 * @param {File} file - the file to process; routed by MIME type, then by extension.
	 * @returns {Promise<{fileName: string, content: string, format: string}>}
	 * @throws {Error} when the file is neither a PDF, spreadsheet, Word document nor image.
	 */
	async function processFile(file) {
	  const format = outputFormat.value;
	  const mime = (file.type || '').toLowerCase();
	  const lowerName = (file.name || '').toLowerCase();
	  const mimeHas = (...parts) => parts.some((part) => mime.includes(part));
	  const nameEndsWith = (...exts) => exts.some((ext) => lowerName.endsWith(ext));

	  let content;
	  if (mimeHas('pdf') || nameEndsWith('.pdf')) {
	    content = await extractTextFromPDF(file);
	  } else if (mimeHas('excel', 'spreadsheet') || nameEndsWith('.xlsx', '.xls')) {
	    // Checked before the Word branch: the xlsx MIME type contains "officedocument",
	    // which the generic 'document' test below would otherwise claim.
	    content = await extractTextFromExcel(file);
	  } else if (mimeHas('word', 'document') || nameEndsWith('.docx', '.doc')) {
	    content = await extractTextFromWord(file);
	  } else if (mimeHas('image')) {
	    content = await extractTextFromImage(file);
	  } else {
	    throw new Error('Unsupported file type: ' + (file.type || file.name));
	  }

	  // Structured results (e.g. spreadsheet objects) are serialised so every branch below
	  // works with a string.
	  const cleanContent = typeof content === 'string' ? content : JSON.stringify(content, null, 2);

	  let formattedContent;
	  if (format === 'json') {
	    formattedContent = JSON.stringify({
	      fileName: file.name,
	      fileType: file.type,
	      fileSize: file.size,
	      content: cleanContent,
	      extractedAt: new Date().toISOString()
	    }, null, 2);
	  } else if (format === 'markdown') {
	    formattedContent = `# ${file.name}\n\n${cleanContent}`;
	  } else if (format === 'formatted') {
	    formattedContent = cleanContent
	      // Break after sentence punctuation only when whitespace follows, so "3.14" and
	      // "example.com" stay intact.
	      .replace(/([.!?])\s+/g, '$1\n\n')
	      .replace(/\n{3,}/g, '\n\n')
	      // Normalise spacing after capitalised words without swallowing line breaks.
	      .replace(/([A-ZÇĞİÖŞÜ][a-zçğıöşü]+)[^\S\n]+/g, '$1 ');
	  } else {
	    formattedContent = cleanContent;
	  }

	  return {
	    fileName: file.name,
	    content: formattedContent,
	    format: format
	  };
	}
	/**
	 * Extracts text from a PDF file.
	 *
	 * Strategy: positional text extraction with PDF.js, then Turkish decoding, then OCR if
	 * fewer than 50 characters came out, then generic text clean-up. If any of that throws,
	 * a simplified extraction of the first five pages is attempted before giving up.
	 *
	 * @param {File|Blob} file - the PDF to read.
	 * @returns {Promise<string>} the extracted text.
	 * @throws {Error} when the file cannot be read or both extraction paths fail.
	 */
	async function extractTextFromPDF(file) {
	  // Read the file once; the buffer must stay in scope for the fallback path below.
	  const arrayBuffer = await new Promise((resolve, reject) => {
	    const reader = new FileReader();
	    reader.onload = (event) => resolve(event.target.result);
	    reader.onerror = () => reject(new Error('Failed to read PDF file'));
	    reader.readAsArrayBuffer(file);
	  });

	  try {
	    // PDF.js gets its own copy so the original buffer stays usable for OCR/fallback.
	    const loadingTask = pdfjsLib.getDocument({
	      data: arrayBuffer.slice(0),
	      cMapUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/cmaps/',
	      cMapPacked: true,
	      standardFontDataUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/standard_fonts/',
	      useSystemFonts: true,
	      useWorkerFetch: true,
	      isEvalSupported: false,
	      disableAutoFetch: false,
	      disableStream: false
	    });
	    const pdf = await loadingTask.promise;
	    let fullText = '';

	    for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
	      const page = await pdf.getPage(pageNumber);

	      // Scale 1 keeps transformed coordinates in the same units as item.width/height,
	      // which the gap heuristics below compare against.
	      const viewport = page.getViewport({ scale: 1.0 });

	      const textContent = await page.getTextContent({
	        normalizeWhitespace: false,
	        disableCombineTextItems: false,
	        includeMarkedContent: false
	      });

	      let pageText = '';
	      let lastY = null;
	      let lastX = null;

	      for (const item of textContent.items) {
	        // Marked-content markers carry no text or transform; skip anything that is
	        // not a real text run.
	        if (typeof item.str !== 'string' || !item.transform) continue;

	        const tx = pdfjsLib.Util.transform(viewport.transform, item.transform);
	        const x = tx[4];
	        const y = tx[5];

	        // A vertical jump of most of a line height starts a new line.
	        if (lastY !== null && Math.abs(y - lastY) > item.height * 0.8) {
	          pageText += '\n';
	        }

	        // A horizontal gap after the previous run gets a separating space.
	        if (lastX !== null && x - lastX > item.width * 0.3) {
	          pageText += ' ';
	        }

	        pageText += item.str;
	        lastY = y;
	        lastX = x + item.width;
	      }

	      // Collapse horizontal whitespace only, so the line breaks added above survive.
	      pageText = pageText
	        .replace(/[^\S\n]+/g, ' ')
	        .replace(/ ?\n ?/g, '\n')
	        .replace(/\n{3,}/g, '\n\n')
	        .trim();

	      if (pageText) {
	        fullText += pageText + '\n\n';
	      }
	    }

	    fullText = decodeTurkishText(fullText);

	    // Scanned PDFs have little or no text layer: fall back to OCR.
	    if (!fullText.trim() || fullText.trim().length < 50) {
	      console.warn('Primary text extraction failed, attempting enhanced OCR...');
	      fullText = await enhancedOCRFallback(arrayBuffer.slice(0));
	    }

	    return improveTextQuality(fullText);
	  } catch (error) {
	    console.error('PDF extraction error:', error);

	    // Simplified extraction: default options, first five pages, items joined by spaces.
	    try {
	      console.warn('Attempting simplified PDF extraction...');
	      const simplePdf = await pdfjsLib.getDocument(arrayBuffer.slice(0)).promise;
	      let simpleText = '';

	      for (let pageNumber = 1; pageNumber <= Math.min(simplePdf.numPages, 5); pageNumber++) {
	        const page = await simplePdf.getPage(pageNumber);
	        const simpleContent = await page.getTextContent();
	        simpleText += simpleContent.items.map((item) => item.str).join(' ') + '\n';
	      }

	      if (simpleText.trim()) {
	        return decodeTurkishText(improveTextQuality(simpleText));
	      }
	    } catch (fallbackError) {
	      console.error('Fallback extraction also failed:', fallbackError);
	    }

	    throw new Error('Failed to extract text from PDF: ' + error.message, { cause: error });
	  }
	}

	// Enhanced Turkish text decoding
	/**
	 * Repairs Turkish letters that were damaged by a wrong character-set decode.
	 *
	 * Two kinds of damage are handled:
	 *  - UTF-8 bytes decoded as Windows-1252/Latin-1 (e.g. "Ã§" for "ç");
	 *  - ISO-8859-9 bytes decoded as Latin-1 (e.g. "ð" for "ğ", "Þ" for "Ş").
	 * Correctly encoded text, including upper-case Turkish letters, passes through unchanged.
	 * HTML entities are decoded afterwards when the `he` library is loaded.
	 *
	 * @param {string} text - text to repair.
	 * @returns {string} the repaired text.
	 */
	function decodeTurkishText(text) {
	  const fixes = [
	    // UTF-8 read as Windows-1252 / Latin-1. The second byte appears either as a C1
	    // control code point (Latin-1) or as its Windows-1252 glyph, so both are accepted.
	    [/\u00C3\u00A7/g, 'ç'], [/\u00C3[\u0087\u2021]/g, 'Ç'],
	    [/\u00C4[\u009F\u0178]/g, 'ğ'], [/\u00C4[\u009E\u017E]/g, 'Ğ'],
	    [/\u00C3\u00BC/g, 'ü'], [/\u00C3[\u009C\u0153]/g, 'Ü'],
	    [/\u00C5[\u009F\u0178]/g, 'ş'], [/\u00C5[\u009E\u017E]/g, 'Ş'],
	    [/\u00C4\u00B1/g, 'ı'], [/\u00C4\u00B0/g, 'İ'],
	    [/\u00C3\u00B6/g, 'ö'], [/\u00C3[\u0096\u2013]/g, 'Ö'],

	    // ISO-8859-9 read as Latin-1: only these six code points differ between the two sets.
	    // NOTE(review): this also rewrites genuine Ð/ð/Ý/ý/Þ/þ (e.g. Icelandic); acceptable
	    // for a Turkish-only decoder, revisit if other languages are processed.
	    [/\u00D0/g, 'Ğ'], [/\u00F0/g, 'ğ'],
	    [/\u00DD/g, 'İ'], [/\u00FD/g, 'ı'],
	    [/\u00DE/g, 'Ş'], [/\u00FE/g, 'ş']
	  ];

	  let decodedText = text;
	  for (const [pattern, replacement] of fixes) {
	    decodedText = decodedText.replace(pattern, replacement);
	  }

	  // Entity decoding is best-effort and depends on the optional `he` global.
	  if (typeof he !== 'undefined') {
	    try {
	      decodedText = he.decode(decodedText);
	    } catch (e) {
	      console.warn('HTML decoding failed:', e);
	    }
	  }

	  return decodedText;
	}
	// Enhanced OCR fallback with multiple engines
	/**
	 * OCR fallback for PDFs without a usable text layer: renders the pages to canvases and
	 * keeps, per page, the longest text produced by three Tesseract passes.
	 *
	 * @param {ArrayBuffer|Uint8Array} pdfData - raw PDF bytes.
	 * @returns {Promise<string>} page texts joined by blank lines, or a fixed notice when
	 *   nothing was recognised.
	 * @throws {Error} when `pdfData` is neither an ArrayBuffer nor a Uint8Array.
	 */
	async function enhancedOCRFallback(pdfData) {
	  let arrayBuffer;
	  if (pdfData instanceof ArrayBuffer) {
	    arrayBuffer = pdfData;
	  } else if (pdfData instanceof Uint8Array) {
	    arrayBuffer = pdfData.buffer;
	  } else {
	    throw new Error('Invalid PDF data format for OCR fallback');
	  }

	  const images = await convertPDFToImagesEnhanced(arrayBuffer);
	  const ocrResults = [];

	  for (const image of images) {
	    // The two passes on the untouched canvas run first. The preprocessing pass may
	    // rewrite the canvas pixels, so it must not overlap with them.
	    const [turEng, engOnly] = await Promise.allSettled([
	      extractTextWithTesseract(image, 'tur+eng'),
	      extractTextWithTesseract(image, 'eng')
	    ]);
	    const [preprocessed] = await Promise.allSettled([
	      extractTextWithTesseract(image, 'tur+eng', { preprocess: true })
	    ]);

	    // Longest text wins; on a tie the earlier strategy in this list is kept.
	    let bestResult = '';
	    for (const outcome of [turEng, preprocessed, engOnly]) {
	      if (outcome.status === 'fulfilled' && outcome.value.length > bestResult.length) {
	        bestResult = outcome.value;
	      }
	    }

	    if (bestResult) {
	      ocrResults.push(bestResult);
	    }
	  }

	  return ocrResults.join('\n\n') || 'OCR processing completed but no text was extracted.';
	}

	// Enhanced Tesseract extraction
	/**
	 * Runs a single Tesseract recognition and returns the recognised text.
	 *
	 * @param {*} image - image source handed to Tesseract (a canvas when preprocessing is requested).
	 * @param {string} [languages='tur+eng'] - Tesseract language code(s).
	 * @param {{preprocess?: boolean}} [options={}] - when `preprocess` is truthy the image is
	 *   first passed through preprocessImage().
	 * @returns {Promise<string>} the recognised text.
	 * @throws rethrows any preprocessing/recognition error after logging it.
	 */
	async function extractTextWithTesseract(image, languages = 'tur+eng', options = {}) {
	  try {
	    const recognitionSettings = {
	      logger: m => console.log(`Tesseract: ${m.status} - ${Math.round(m.progress * 100)}%`),
	      preserve_interword_spaces: '1',
	      tessedit_pageseg_mode: '6',
	      tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" ğüşıöçĞÜŞİÖÇ@#$%^&*+=<>:;_ ',
	      load_system_dawg: '1',
	      load_freq_dawg: '1'
	    };

	    // Optionally clean the image up before recognition.
	    const source = options.preprocess ? await preprocessImage(image) : image;

	    const { data } = await Tesseract.recognize(source, languages, recognitionSettings);
	    return data.text;
	  } catch (failure) {
	    console.error('Tesseract OCR error:', failure);
	    throw failure;
	  }
	}

	// Image preprocessing for better OCR
	/**
	 * Produces a black-and-white copy of a canvas for OCR.
	 *
	 * Each pixel is converted to luminance and then binarised against a fixed threshold of
	 * 128. The result is drawn on a new canvas; the canvas passed in is left untouched so
	 * other OCR strategies can still use the original pixels.
	 *
	 * @param {HTMLCanvasElement} canvas - source image.
	 * @returns {Promise<HTMLCanvasElement>} a new canvas of the same size.
	 */
	async function preprocessImage(canvas) {
	  const threshold = 128;
	  const sourceCtx = canvas.getContext('2d');
	  const imageData = sourceCtx.getImageData(0, 0, canvas.width, canvas.height);
	  const data = imageData.data;

	  // Grayscale and threshold in one pass; the alpha channel is left as it is.
	  for (let i = 0; i < data.length; i += 4) {
	    const gray = data[i] * 0.299 + data[i + 1] * 0.587 + data[i + 2] * 0.114;
	    const value = Math.round(gray) > threshold ? 255 : 0;
	    data[i] = value;
	    data[i + 1] = value;
	    data[i + 2] = value;
	  }

	  const output = document.createElement('canvas');
	  output.width = canvas.width;
	  output.height = canvas.height;
	  output.getContext('2d').putImageData(imageData, 0, 0);
	  return output;
	}
	// Enhanced PDF to image conversion
	/**
	 * Renders the first pages of a PDF (at most ten) to canvases at 3x scale.
	 *
	 * @param {ArrayBuffer|Uint8Array} pdfData - raw PDF bytes; copied before being handed to PDF.js.
	 * @returns {Promise<HTMLCanvasElement[]>} one canvas per rendered page, in page order.
	 * @throws {Error} when `pdfData` is neither an ArrayBuffer nor a Uint8Array.
	 */
	async function convertPDFToImagesEnhanced(pdfData) {
	  const isArrayBuffer = pdfData instanceof ArrayBuffer;
	  if (!isArrayBuffer && !(pdfData instanceof Uint8Array)) {
	    throw new Error('Invalid PDF data format for image conversion');
	  }

	  // Work on a private copy of the bytes.
	  const bytes = new Uint8Array((isArrayBuffer ? pdfData : pdfData.buffer).slice(0));

	  const pdf = await pdfjsLib.getDocument({
	    data: bytes.buffer,
	    cMapUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/cmaps/',
	    cMapPacked: true,
	    standardFontDataUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/standard_fonts/',
	    // NOTE(review): confirm these worker-related flags are honoured by the loaded PDF.js version.
	    useWorkerFetch: false,
	    isEvalSupported: false,
	    disableWorker: true
	  }).promise;

	  const rendered = [];
	  const lastPage = Math.min(pdf.numPages, 10);

	  for (let pageNo = 1; pageNo <= lastPage; pageNo += 1) {
	    const page = await pdf.getPage(pageNo);
	    const viewport = page.getViewport({ scale: 3.0 });

	    const surface = document.createElement('canvas');
	    const surfaceCtx = surface.getContext('2d');
	    surface.height = viewport.height;
	    surface.width = viewport.width;

	    const renderTask = page.render({
	      canvasContext: surfaceCtx,
	      viewport: viewport,
	      renderInteractiveForms: true,
	      intent: 'print'
	    });
	    await renderTask.promise;

	    rendered.push(surface);
	  }

	  return rendered;
	}

	// Text quality improvement
	/**
	 * Conservative clean-up of extracted text.
	 *
	 * - Fixes digit/letter confusion (0→O, 1→I, 5→S) only when the digit sits between two
	 *   upper-case letters, i.e. inside an all-caps word; ordinary numbers are never touched.
	 * - Normalises whitespace while keeping line and paragraph breaks.
	 *
	 * Letters are deliberately not rewritten to their Turkish diacritic forms: without a
	 * dictionary there is no way to tell "c" from a misread "ç", and blanket substitution
	 * corrupts correct text.
	 *
	 * @param {string} text - text to clean; non-strings yield an empty string.
	 * @returns {string} the cleaned text.
	 */
	function improveTextQuality(text) {
	  if (typeof text !== 'string' || text.length === 0) return '';

	  return text
	    // Digit between upper-case letters is almost certainly a misread letter.
	    .replace(/([A-ZÇĞİÖŞÜ])0(?=[A-ZÇĞİÖŞÜ])/g, '$1O')
	    .replace(/([A-ZÇĞİÖŞÜ])1(?=[A-ZÇĞİÖŞÜ])/g, '$1I')
	    .replace(/([A-ZÇĞİÖŞÜ])5(?=[A-ZÇĞİÖŞÜ])/g, '$1S')

	    // Whitespace: unify newlines, collapse horizontal runs, cap blank lines at one.
	    .replace(/\r\n?/g, '\n')
	    .replace(/[^\S\n]+/g, ' ')
	    .replace(/ ?\n ?/g, '\n')
	    .replace(/\n{3,}/g, '\n\n')
	    .trim();
	}
	/**
	 * Extracts text from a Word document with mammoth.
	 *
	 * The raw-text extraction is tried first. If it yields fewer than 50 characters the
	 * Markdown conversion is tried as well and used when it produces more text. Both
	 * results pass through decodeTurkishText() and improveTextQuality().
	 *
	 * @param {File|Blob} file - the document to read.
	 * @returns {Promise<string>} the extracted text.
	 * @throws {Error} when the file cannot be read or the primary extraction fails.
	 */
	async function extractTextFromWord(file) {
	  const arrayBuffer = await new Promise((resolve, reject) => {
	    const reader = new FileReader();
	    reader.onload = (event) => resolve(event.target.result);
	    reader.onerror = () => reject(new Error('Failed to read Word file'));
	    reader.readAsArrayBuffer(file);
	  });

	  // The original call nested a style map under an `options` key of the input object;
	  // that is not a documented input for raw-text extraction, so it is not passed.
	  const result = await mammoth.extractRawText({ arrayBuffer });
	  let text = improveTextQuality(decodeTurkishText(result.value));

	  if (text.trim().length < 50) {
	    console.warn('Primary Word extraction failed, trying alternative...');
	    try {
	      const altResult = await mammoth.convertToMarkdown({ arrayBuffer });
	      if (altResult.value && altResult.value.trim().length > text.trim().length) {
	        text = improveTextQuality(decodeTurkishText(altResult.value));
	      }
	    } catch (altError) {
	      // A short document is still a valid result; keep it if the alternative fails.
	      console.warn('Alternative Word extraction failed:', altError);
	    }
	  }

	  return text;
	}
	/**
	 * Reads a spreadsheet with SheetJS and returns its sheets as structured data.
	 *
	 * @param {File|Blob} file - the workbook to read.
	 * @returns {Promise<Object<string, {data: Array<Array<*>>, csv: string, range: string,
	 *   rowCount: number, colCount: number}>>} one entry per sheet name: the rows as arrays
	 *   of formatted cell values, a tab-separated rendering, the used range, and the
	 *   row/column counts (column count taken from the first row).
	 * @throws {Error} when the file cannot be read or parsed.
	 */
	async function extractTextFromExcel(file) {
	  const buffer = await new Promise((resolve, reject) => {
	    const reader = new FileReader();
	    reader.onload = (event) => resolve(event.target.result);
	    reader.onerror = () => reject(new Error('Failed to read spreadsheet file'));
	    reader.readAsArrayBuffer(file);
	  });

	  const workbook = XLSX.read(new Uint8Array(buffer), {
	    type: 'array',
	    codepage: 1254, // Turkish codepage
	    cellStyles: true,
	    cellHTML: false
	  });

	  const result = {};

	  for (const sheetName of workbook.SheetNames) {
	    const worksheet = workbook.Sheets[sheetName];

	    const jsonData = XLSX.utils.sheet_to_json(worksheet, {
	      header: 1,
	      raw: false,
	      dateNF: 'dd/mm/yyyy',
	      defval: ''
	    });

	    const csvData = XLSX.utils.sheet_to_csv(worksheet, {
	      FS: '\t',
	      RS: '\n',
	      dateNF: 'dd/mm/yyyy'
	    });

	    // Only encoding repair is applied to cells. With raw:false numbers arrive as
	    // strings, and OCR-style corrections (0→O, 1→I, ...) would corrupt them; spreadsheet
	    // cells are not OCR output.
	    const processedData = jsonData.map((row) =>
	      row.map((cell) => (typeof cell === 'string' ? decodeTurkishText(cell) : cell))
	    );

	    result[sheetName] = {
	      data: processedData,
	      csv: decodeTurkishText(csvData),
	      range: worksheet['!ref'] || '',
	      rowCount: jsonData.length,
	      colCount: jsonData[0] ? jsonData[0].length : 0
	    };
	  }

	  return result;
	}
	/**
	 * Thin wrapper around convertPDFToImagesEnhanced() that hands it a fresh ArrayBuffer
	 * copy when given an ArrayBuffer or Uint8Array, and the value unchanged otherwise.
	 *
	 * @param {ArrayBuffer|Uint8Array|*} pdfData - raw PDF bytes.
	 * @returns {Promise<HTMLCanvasElement[]>} whatever convertPDFToImagesEnhanced() resolves to.
	 */
	async function convertPDFToImages(pdfData) {
	  let payload = pdfData;
	  if (pdfData instanceof ArrayBuffer) {
	    payload = pdfData.slice(0);
	  } else if (pdfData instanceof Uint8Array) {
	    payload = pdfData.buffer.slice(0);
	  }
	  return await convertPDFToImagesEnhanced(payload);
	}
	async function extractTextFromImage(file) {
	return new Promise(async (resolve, reject) => {
	try {
	const imageElement = file instanceof HTMLCanvasElement ? file : file;

	// Apply advanced preprocessing
	const processedImages = await applyAdvancedPreprocessing(imageElement);

	// Multi-strategy OCR approach
	const ocrResults = [];

	for (const processedImage of processedImages) {
	const results = await Promise.allSettled([
	// Strategy 1: Turkish with best settings
	performAdvancedOCR(processedImage, 'tur', {
	tessedit_pageseg_mode: '6',
	preserve_interword_spaces: '1',
	tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" @#$%^&*+=<>:;_ ğüşıöçĞÜŞİÖÇ',
	tessedit_ocr_engine_mode: '1',
	tessedit_do_ocr: '1',
	tessedit_load_image: '1'
	}),
	// Strategy 2: Turkish+English with auto segmentation
	performAdvancedOCR(processedImage, 'tur+eng', {
	tessedit_pageseg_mode: '1',
	preserve_interword_spaces: '1',
	tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" @#$%^&*+=<>:;_ ğüşıöçĞÜŞİÖÇ',
	tessedit_ocr_engine_mode: '1'
	}),
	// Strategy 3: Single column mode
	performAdvancedOCR(processedImage, 'tur', {
	tessedit_pageseg_mode: '3',
	preserve_interword_spaces: '1',
	tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" @#$%^&*+=<>:;_ ğüşıöçĞÜŞİÖÇ'
	})
	]);

	results.forEach(result => {
	if (result.status === 'fulfilled' && result.value.text.trim().length > 10) {
	ocrResults.push({
	text: result.value.text,
	confidence: result.value.confidence \|\| calculateConfidence(result.value.text),
	strategy: result.value.strategy
	});
	}
	});
	}

	// Select best result using advanced scoring
	const bestResult = selectBestResult(ocrResults);

	if (bestResult) {
	// Apply document structure analysis
	const structuredText = await analyzeDocumentStructure(bestResult.text);

	// Apply intelligent Turkish text corrections
	const correctedText = applyIntelligentTurkishCorrections(structuredText);

	resolve(correctedText);
	} else {
	resolve('No text could be extracted from the image.');
	}

	} catch (error) {
	console.error('Enhanced image OCR error:', error);
	reject(error);
	}
	});
	// Apply advanced image preprocessing techniques
	/**
	 * Builds the list of image variants that OCR is attempted on: the base canvas first,
	 * followed by the results of the 'contrast', 'denoise', 'sharpen' and 'binarize'
	 * enhancements (null results are dropped).
	 *
	 * @param {HTMLCanvasElement|*} imageElement - a canvas, or an image source that
	 *   imageToCanvas() can convert.
	 * @returns {Promise<HTMLCanvasElement[]>} base canvas plus enhanced variants.
	 */
	async function applyAdvancedPreprocessing(imageElement) {
	  const baseCanvas = imageElement instanceof HTMLCanvasElement
	    ? imageElement
	    : await imageToCanvas(imageElement);

	  // Enhancements run one after another, in this order.
	  const enhanced = [];
	  for (const mode of ['contrast', 'denoise', 'sharpen', 'binarize']) {
	    enhanced.push(await applyImageEnhancement(baseCanvas, mode));
	  }

	  return [baseCanvas, ...enhanced.filter((variant) => variant !== null)];
	}

	// Convert image to canvas
	/**
	 * Draws an image source onto a new canvas of the same size.
	 *
	 * @param {HTMLCanvasElement|Blob|File} image - a canvas (copied via its data URL) or a
	 *   blob/file (loaded through a temporary object URL).
	 * @returns {Promise<HTMLCanvasElement>} canvas containing the image.
	 * @throws {Error} when the image cannot be loaded or drawn.
	 */
	async function imageToCanvas(image) {
	  const isCanvas = image instanceof HTMLCanvasElement;
	  const src = isCanvas ? image.toDataURL() : URL.createObjectURL(image);

	  // Object URLs keep the blob alive until revoked; release it once loading has settled.
	  const release = () => {
	    if (!isCanvas) {
	      URL.revokeObjectURL(src);
	    }
	  };

	  return new Promise((resolve, reject) => {
	    const img = new Image();

	    img.onload = () => {
	      try {
	        const canvas = document.createElement('canvas');
	        canvas.width = img.width;
	        canvas.height = img.height;
	        const ctx = canvas.getContext('2d');
	        ctx.drawImage(img, 0, 0);
	        resolve(canvas);
	      } catch (error) {
	        reject(error);
	      } finally {
	        release();
	      }
	    };

	    // Without this handler a corrupt or unsupported image would leave the promise pending forever.
	    img.onerror = () => {
	      release();
	      reject(new Error('Failed to load image for OCR'));
	    };

	    img.src = src;
	  });
	}

	// Apply specific image enhancement
	/**
	 * Returns an enhanced copy of a canvas for OCR.
	 *
	 * Supported types:
	 *  - 'contrast': scales each colour channel around 128 by a factor of 1.5;
	 *  - 'denoise':  pulls channels that differ from the pixel's channel average by more
	 *                than 30 back to that average;
	 *  - 'sharpen':  3x3 sharpening kernel (edge pixels reuse the nearest in-bounds neighbour);
	 *  - 'binarize': luminance thresholded at 128 to pure black/white.
	 * Any other type yields an unmodified copy.
	 *
	 * The result is always drawn on a new canvas. The source canvas is never modified, so
	 * several enhancements can be derived from the same original.
	 *
	 * @param {HTMLCanvasElement} canvas - source image.
	 * @param {string} type - enhancement to apply.
	 * @returns {Promise<HTMLCanvasElement>} new canvas holding the enhanced image.
	 */
	async function applyImageEnhancement(canvas, type) {
	  const width = canvas.width;
	  const height = canvas.height;
	  const sourceCtx = canvas.getContext('2d');
	  const imageData = sourceCtx.getImageData(0, 0, width, height);
	  const data = imageData.data;

	  const target = document.createElement('canvas');
	  target.width = width;
	  target.height = height;
	  const targetCtx = target.getContext('2d');

	  // Pixels that end up on the target; 'sharpen' swaps in its own buffer.
	  let resultData = imageData;

	  switch (type) {
	    case 'contrast': {
	      const contrast = 1.5;
	      for (let i = 0; i < data.length; i += 4) {
	        data[i] = ((data[i] - 128) * contrast) + 128;
	        data[i + 1] = ((data[i + 1] - 128) * contrast) + 128;
	        data[i + 2] = ((data[i + 2] - 128) * contrast) + 128;
	      }
	      break;
	    }

	    case 'denoise': {
	      const threshold = 30;
	      for (let i = 0; i < data.length; i += 4) {
	        const avg = (data[i] + data[i + 1] + data[i + 2]) / 3;
	        if (Math.abs(data[i] - avg) > threshold) data[i] = avg;
	        if (Math.abs(data[i + 1] - avg) > threshold) data[i + 1] = avg;
	        if (Math.abs(data[i + 2] - avg) > threshold) data[i + 2] = avg;
	      }
	      break;
	    }

	    case 'sharpen': {
	      const weights = [0, -1, 0, -1, 5, -1, 0, -1, 0];
	      const side = Math.round(Math.sqrt(weights.length));
	      const halfSide = Math.floor(side / 2);
	      const output = targetCtx.createImageData(width, height);
	      const dst = output.data;

	      for (let y = 0; y < height; y++) {
	        for (let x = 0; x < width; x++) {
	          let r = 0;
	          let g = 0;
	          let b = 0;

	          for (let cy = 0; cy < side; cy++) {
	            for (let cx = 0; cx < side; cx++) {
	              // Clamp to the image so the kernel weights always sum to 1; skipping
	              // out-of-bounds taps would brighten the border pixels.
	              const scy = Math.min(Math.max(y + cy - halfSide, 0), height - 1);
	              const scx = Math.min(Math.max(x + cx - halfSide, 0), width - 1);
	              const srcOff = (scy * width + scx) * 4;
	              const wt = weights[cy * side + cx];
	              r += data[srcOff] * wt;
	              g += data[srcOff + 1] * wt;
	              b += data[srcOff + 2] * wt;
	            }
	          }

	          const dstOff = (y * width + x) * 4;
	          dst[dstOff] = r;
	          dst[dstOff + 1] = g;
	          dst[dstOff + 2] = b;
	          dst[dstOff + 3] = 255;
	        }
	      }
	      resultData = output;
	      break;
	    }

	    case 'binarize': {
	      for (let i = 0; i < data.length; i += 4) {
	        const gray = data[i] * 0.299 + data[i + 1] * 0.587 + data[i + 2] * 0.114;
	        const value = gray > 128 ? 255 : 0;
	        data[i] = value;
	        data[i + 1] = value;
	        data[i + 2] = value;
	      }
	      break;
	    }

	    default:
	      break;
	  }

	  targetCtx.putImageData(resultData, 0, 0);
	  return target;
	}

	// Advanced OCR processing
	/**
	 * Runs one OCR strategy and reports its text and confidence.
	 *
	 * Never rejects: a failed recognition is logged and reported as an empty result so that
	 * the remaining strategies can still be compared.
	 *
	 * @param {*} image - image source handed to Tesseract.
	 * @param {string} languages - Tesseract language code(s).
	 * @param {Object} [config={}] - extra settings merged into the recognise options;
	 *   `tessedit_pageseg_mode` is also used to label the strategy.
	 * @returns {Promise<{text: string, confidence: number, strategy: string}>}
	 */
	async function performAdvancedOCR(image, languages, config = {}) {
	  const settings = config ?? {};
	  const strategy = `OCR_${languages}_${settings.tessedit_pageseg_mode}`;

	  try {
	    const result = await Tesseract.recognize(image, languages, {
	      logger: m => console.log(`OCR (${languages}): ${m.status} - ${Math.round(m.progress * 100)}%`),
	      ...settings
	    });

	    return {
	      text: result.data.text ?? '',
	      confidence: result.data.confidence || 0,
	      strategy
	    };
	  } catch (error) {
	    console.error(`OCR strategy failed:`, error);
	    // Same shape as a successful result so callers can treat both uniformly.
	    return { text: '', confidence: 0, strategy };
	  }
	}
	// Select best OCR result using advanced scoring
	/**
	 * Picks the OCR result with the highest calculateAdvancedScore(); on equal scores the
	 * earlier result is kept.
	 *
	 * @param {Array<{text: string, confidence: number}>} results - candidate results.
	 * @returns {Object|null} the winning result, or null when there are no candidates.
	 */
	function selectBestResult(results) {
	  if (results.length === 0) return null;

	  let winner = null;
	  let topScore = -1;

	  for (const candidate of results) {
	    const candidateScore = calculateAdvancedScore(candidate.text, candidate.confidence);
	    if (candidateScore > topScore) {
	      topScore = candidateScore;
	      winner = candidate;
	    }
	  }

	  return winner;
	}

	// Calculate advanced scoring for OCR results
	/**
	 * Scores an OCR result for comparison with other strategies (higher is better, max 100).
	 *
	 * The score starts at the engine confidence and adds:
	 *  - up to 40 points for the share of Turkish-specific letters among non-space characters;
	 *  - up to 20 points for the share of words containing a Turkish-specific letter;
	 *  - up to 20 points for average sentence length (saturating at 10 words).
	 * Texts shorter than 20 characters are halved.
	 *
	 * @param {string} text - recognised text.
	 * @param {number} baseConfidence - engine confidence; falsy values count as 0.
	 * @returns {number} score between 0 and 100; 0 for empty text.
	 */
	function calculateAdvancedScore(text, baseConfidence) {
	  if (!text || text.trim().length === 0) return 0;

	  // Letters that exist in Turkish but not in plain ASCII. Plain "i"/"I" are deliberately
	  // excluded: they appear in almost every language and would make the word test meaningless.
	  const turkishLetter = /[ğüşıöçĞÜŞİÖÇ]/;

	  let score = baseConfidence || 0;

	  // Turkish character ratio (40 points)
	  const turkishChars = (text.match(/[ğüşıöçĞÜŞİÖÇ]/g) || []).length;
	  const totalChars = text.replace(/\s/g, '').length;
	  const turkishRatio = totalChars > 0 ? turkishChars / totalChars : 0;
	  score += turkishRatio * 40;

	  // Word ratio (20 points). Unicode-aware matching: ASCII \w would split "güzel"
	  // into "g" and "zel".
	  const words = text.match(/[\p{L}\p{N}_]+/gu) || [];
	  const turkishWords = words.filter((word) => turkishLetter.test(word));
	  const wordRatio = words.length > 0 ? turkishWords.length / words.length : 0;
	  score += wordRatio * 20;

	  // Sentence structure (20 points)
	  const sentences = text.split(/[.!?]+/).filter((s) => s.trim().length > 5);
	  const avgSentenceLength = sentences.length > 0
	    ? sentences.reduce((sum, s) => sum + s.split(/\s+/).length, 0) / sentences.length
	    : 0;
	  score += Math.min(avgSentenceLength / 10, 1) * 20;

	  // Very short texts are unreliable.
	  if (text.trim().length < 20) score *= 0.5;

	  return Math.min(score, 100);
	}
	// Analyze document structure like Abbyy FineReader
	/**
	 * Groups the non-empty lines of a text into heading, list, table and paragraph
	 * sections and renders them with formatStructuredText().
	 *
	 * Every heading line becomes its own section. Consecutive list, table or paragraph
	 * lines are merged into one section of that type. Headings are tested first, then list
	 * items, then table rows; everything else is paragraph text.
	 *
	 * @param {string} text - raw text, one logical line per '\n'.
	 * @returns {Promise<string>} the formatted document.
	 */
	async function analyzeDocumentStructure(text) {
	  const sections = [];
	  let open = { type: 'paragraph', content: [], level: 0 };

	  // Store the section being built, unless it is still empty.
	  const flush = () => {
	    if (open.content.length > 0) {
	      sections.push(open);
	    }
	  };

	  const meaningfulLines = text.split('\n').filter(line => line.trim().length > 0);

	  for (const rawLine of meaningfulLines) {
	    const line = rawLine.trim();

	    if (isHeading(line)) {
	      flush();
	      open = {
	        type: 'heading',
	        content: [line],
	        level: detectHeadingLevel(line)
	      };
	      continue;
	    }

	    let kind;
	    if (isListItem(line)) {
	      kind = 'list';
	    } else if (isTableRow(line)) {
	      kind = 'table';
	    } else {
	      kind = 'paragraph';
	    }

	    // A change of kind closes the current section and opens a new one.
	    if (open.type !== kind) {
	      flush();
	      open = { type: kind, content: [], level: 0 };
	    }
	    open.content.push(line);
	  }

	  flush();

	  return formatStructuredText(sections);
	}

	// Check if line is a heading
	/**
	 * Heuristically decides whether a line is a heading.
	 *
	 * Short lines (under 50 characters and at most 8 words) are headings when more than
	 * 60% of their words start with an upper-case letter. Longer lines are headings when
	 * they end with a colon or start with a number followed by an upper-case word.
	 *
	 * NOTE(review): a short line is decided by the capitalisation ratio alone and never
	 * reaches the colon/numbered checks. Confirm that is intended.
	 *
	 * @param {string} line - a single trimmed line.
	 * @returns {boolean} true when the line looks like a heading.
	 */
	function isHeading(line) {
	  const tokens = line.split(/\s+/);
	  const isShortLine = line.length < 50 && tokens.length <= 8;

	  if (isShortLine) {
	    let capitalised = 0;
	    for (const token of tokens) {
	      if (/^[A-ZÇĞİÖŞÜ]/.test(token)) {
	        capitalised += 1;
	      }
	    }
	    return capitalised / tokens.length > 0.6;
	  }

	  return line.endsWith(':') || /^\d+\.?\s+[A-ZÇĞİÖŞÜ]/.test(line);
	}

	// Detect heading level
	/**
	 * Estimates a heading level (1-3) for a heading line.
	 *
	 * Numbering wins: "1.2 Title" is level 2 and "3 Title" is level 1. Otherwise the level
	 * grows with line length: under 30 characters is level 1, under 40 is level 2, and
	 * anything longer is level 3.
	 *
	 * @param {string} line - heading text.
	 * @returns {number} 1, 2 or 3.
	 */
	function detectHeadingLevel(line) {
	  const numberingRules = [
	    [/^\d+\.\d+\s+/, 2],
	    [/^\d+\s+/, 1]
	  ];
	  for (const [pattern, level] of numberingRules) {
	    if (pattern.test(line)) {
	      return level;
	    }
	  }

	  const size = line.length;
	  if (size < 30) {
	    return 1;
	  }
	  return size < 40 ? 2 : 3;
	}

	// Check if line is a list item
	/**
	 * Tells whether a line starts with a list marker followed by whitespace: a bullet
	 * (-, *, •), a number with a dot ("1."), or a lower-case letter in parentheses ("(a)").
	 *
	 * @param {string} line - a single trimmed line.
	 * @returns {boolean} true for list items.
	 */
	function isListItem(line) {
	  return /^[-*•]\s+/.test(line) ||
	    /^\d+\.\s+/.test(line) ||
	    /^\([a-z]\)\s+/.test(line);
	}

	// Check if line is a table row
	/**
	 * Tells whether a line looks like a table row: at least two tab characters, or at
	 * least three pipe characters.
	 *
	 * @param {string} line - a single line.
	 * @returns {boolean} true for probable table rows.
	 */
	function isTableRow(line) {
	  return (line.split('\t').length > 2) ||
	    (line.split('|').length > 3);
	}

	// Format structured text based on output format
	/**
	 * Renders analysed sections in the format currently selected in the `outputFormat`
	 * control. Unknown formats fall back to plain text: one line per section, sections
	 * separated by blank lines.
	 *
	 * @param {Array<{type: string, content: string[], level: number}>} sections
	 * @returns {string} the rendered document.
	 */
	function formatStructuredText(sections) {
	  switch (outputFormat.value) {
	    case 'markdown':
	      return formatAsMarkdown(sections);
	    case 'json':
	      return formatAsJSON(sections);
	    case 'formatted':
	      return formatAsStructuredText(sections);
	    default:
	      return sections.map(s => s.content.join(' ')).join('\n\n');
	  }
	}

	// Format as Markdown
	/**
	 * Renders analysed sections as Markdown.
	 *
	 * - Headings use `#` markers (level clamped to 1-6).
	 * - List items keep their numbering ("1. x"); bullet items are normalised to "- x"
	 *   without doubling an existing bullet.
	 * - Table rows are split on tabs or pipes, trimmed, and emitted as a pipe table whose
	 *   first row is the header (a separator row is inserted; existing separators are dropped).
	 * - Paragraph lines are joined with spaces.
	 *
	 * @param {Array<{type: string, content: string[], level: number}>} sections
	 * @returns {string} Markdown text; blocks are separated by blank lines.
	 */
	function formatAsMarkdown(sections) {
	  // Split a row into trimmed cells, ignoring the empty cells produced by edge pipes.
	  const splitCells = (row) => {
	    const cells = row.split(/\t+|\|/).map((cell) => cell.trim());
	    if (cells.length > 1 && cells[0] === '') cells.shift();
	    if (cells.length > 1 && cells[cells.length - 1] === '') cells.pop();
	    return cells;
	  };
	  const isSeparatorRow = (cells) => cells.every((cell) => /^:?-{2,}:?$/.test(cell));
	  const toRow = (cells) => `| ${cells.join(' | ')} |`;

	  const blocks = [];

	  for (const section of sections) {
	    switch (section.type) {
	      case 'heading': {
	        const level = Math.min(Math.max(section.level || 1, 1), 6);
	        blocks.push(`${'#'.repeat(level)} ${section.content[0]}`);
	        break;
	      }
	      case 'list': {
	        const items = section.content.map((item) =>
	          /^\d+\.\s+/.test(item) ? item : `- ${item.replace(/^[-*•]\s+/, '')}`
	        );
	        blocks.push(items.join('\n'));
	        break;
	      }
	      case 'table': {
	        const rows = section.content.map(splitCells).filter((cells) => !isSeparatorRow(cells));
	        if (rows.length > 0) {
	          const [header, ...body] = rows;
	          blocks.push([
	            toRow(header),
	            toRow(header.map(() => '---')),
	            ...body.map(toRow)
	          ].join('\n'));
	        }
	        break;
	      }
	      case 'paragraph':
	        blocks.push(section.content.join(' '));
	        break;
	      default:
	        break;
	    }
	  }

	  return blocks.join('\n\n').trim();
	}

	// Format as JSON
	/**
	 * Serialises analysed sections as pretty-printed JSON, keeping only the `type`,
	 * `level` and `content` of each section.
	 *
	 * @param {Array<{type: string, content: string[], level: number}>} sections
	 * @returns {string} JSON text indented with two spaces.
	 */
	function formatAsJSON(sections) {
	  const pickFields = ({ type, level, content }) => ({ type, level, content });
	  return JSON.stringify(sections.map(pickFields), null, 2);
	}

	// Format as structured text
	/**
	 * Renders analysed sections as plain structured text.
	 *
	 * - Headings are upper-cased with Turkish casing rules (i→İ, ı→I) and underlined with '='.
	 * - Bullet items are shown as " • item" (an existing bullet marker is not doubled);
	 *   numbered items keep their number.
	 * - Table rows are emitted as they are; paragraph lines are joined with spaces.
	 *
	 * @param {Array<{type: string, content: string[], level: number}>} sections
	 * @returns {string} the rendered text, trimmed.
	 */
	function formatAsStructuredText(sections) {
	  let text = '';

	  sections.forEach((section) => {
	    switch (section.type) {
	      case 'heading': {
	        // Locale-aware upper-casing: plain toUpperCase() turns "giriş" into "GIRIŞ".
	        const title = section.content[0].toLocaleUpperCase('tr-TR');
	        text += '\n' + title + '\n';
	        text += '='.repeat(title.length) + '\n\n';
	        break;
	      }
	      case 'list':
	        section.content.forEach((item) => {
	          text += /^\d+\.\s+/.test(item)
	            ? ' ' + item + '\n'
	            : ' • ' + item.replace(/^[-*•]\s+/, '') + '\n';
	        });
	        text += '\n';
	        break;
	      case 'table':
	        section.content.forEach((row) => {
	          text += row + '\n';
	        });
	        text += '\n';
	        break;
	      case 'paragraph':
	        text += section.content.join(' ') + '\n\n';
	        break;
	      default:
	        break;
	    }
	  });

	  return text.trim();
	}

	/**
	 * Apply conservative corrections to OCR'd Turkish text.
	 *
	 * What is corrected:
	 * - Digit look-alikes (0 -> O, 1 -> İ, 5 -> S), but only when the digit
	 *   sits between two upper-case letters (e.g. "T0PLAM" -> "TOPLAM").
	 *   Ordinary numbers such as "2015" are left untouched.
	 * - The word "icin" -> "için", preserving upper/title case.
	 * - Spacing around punctuation: whitespace before punctuation is removed
	 *   and exactly one space follows a run of punctuation. Separators inside
	 *   numbers ("3.14", "12:30", "1,5") and runs such as "..." stay intact.
	 * - Any run of whitespace is collapsed to a single space and the result
	 *   is trimmed.
	 *
	 * What is deliberately NOT done: rewriting isolated single letters
	 * ("o" -> "ö", "c" -> "ç", ...) or valid words ("su" -> "şu"). A lone
	 * letter or a real Turkish word carries no evidence of an OCR error, and
	 * JavaScript's ASCII-only `\b` made such rules fire inside Turkish words
	 * ("kişi" became "kişı").
	 *
	 * @param {string} text - Raw OCR text; null/undefined is treated as ''.
	 * @returns {string} The corrected, whitespace-normalised text.
	 */
	function applyIntelligentTurkishCorrections(text) {
		const digitLookalikes = { 0: 'O', 1: 'İ', 5: 'S' };

		// A digit is only treated as a misread letter when both neighbours are
		// upper-case letters. The right neighbour is a lookahead so adjacent
		// matches ("A0B1C") can share it.
		const digitInsideUpperWord = /([A-ZÇĞİÖŞÜ])([015])(?=[A-ZÇĞİÖŞÜ])/g;

		// `\b` does not know Turkish letters, so the word boundary is spelled
		// out: a non-word character (or start of text) before, none after.
		const icinWord = /(^|[^A-Za-z0-9_çğıöşüÇĞİÖŞÜ])(icin)(?![A-Za-z0-9_çğıöşüÇĞİÖŞÜ])/gi;

		let correctedText = String(text == null ? '' : text);

		correctedText = correctedText.replace(
			digitInsideUpperWord,
			(match, before, digit) => before + digitLookalikes[digit]
		);

		correctedText = correctedText.replace(icinWord, (match, prefix, word) => {
			if (word === word.toUpperCase()) {
				return prefix + 'İÇİN';
			}
			if (word[0] === 'I') {
				return prefix + 'İçin';
			}
			return prefix + 'için';
		});

		// Fix spacing around punctuation
		return correctedText
			.replace(/\s+([.,!?;:])/g, '$1')
			// First alternative keeps number separators ("3.14") untouched;
			// second puts exactly one space after a run of punctuation
			.replace(/(\d[.,:](?=\d))|([.,!?;:]+)\s*/g, (match, numeric, punctuation) => (
				numeric ? numeric : `${punctuation} `
			))
			.replace(/\s+/g, ' ')
			.trim();
	}
	/**
	 * Convert hOCR markup into plain text, one output line per `.ocr_line`.
	 *
	 * For every `.ocrx_word`:
	 * - a user-confirmed correction stored in `window.ocrLearningDict` under
	 *   exactly that word replaces the recognised text (whole-word match, so
	 *   neither the hOCR markup nor other words are touched);
	 * - confidence below 50 wraps the word in `[...]`, below 70 in
	 *   `<span confidence-medium>...</span>`, otherwise it is emitted as-is;
	 * - confidence below 85 records the recognised word in
	 *   `window.ocrLearningDict` (creating it if needed) and bumps its
	 *   `occurrences` counter.
	 *
	 * Each line is followed by a blank line and each paragraph by one more
	 * newline.
	 *
	 * @param {string} hocr - hOCR markup produced by the OCR engine.
	 * @returns {string} The extracted text with confidence markers.
	 */
	function processFormattedOCR(hocr) {
		const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);

		// Read `x_wconf` from the word's title attribute. Without a score
		// there is no evidence the word is wrong, so treat it as confident
		// instead of throwing on the missing attribute.
		const readConfidence = (wordEl) => {
			const title = wordEl.getAttribute('title') || '';
			const found = /x_wconf\s+(\d+(?:\.\d+)?)/.exec(title);
			return found ? parseFloat(found[1]) : 100;
		};

		// Look up a confirmed correction for exactly this word, ignoring
		// inherited keys such as "constructor"
		const applyLearnedCorrection = (recognised) => {
			const dict = window.ocrLearningDict;
			if (!dict || !hasOwn(dict, recognised)) {
				return recognised;
			}
			const confirmed = dict[recognised].confirmedCorrect;
			return confirmed ? confirmed : recognised;
		};

		// Remember a low-confidence word so the user can confirm it later
		const rememberUncertainWord = (recognised) => {
			if (!window.ocrLearningDict) {
				window.ocrLearningDict = {};
			}
			const dict = window.ocrLearningDict;
			if (!hasOwn(dict, recognised)) {
				dict[recognised] = {
					occurrences: 0,
					confirmedCorrect: null,
					suggestTime: null
				};
			}
			dict[recognised].occurrences++;
		};

		// Parse hOCR output to preserve formatting and layout
		const parser = new DOMParser();
		const doc = parser.parseFromString(hocr, 'text/html');
		const paragraphs = doc.querySelectorAll('.ocr_par');

		let formattedText = '';

		paragraphs.forEach((par) => {
			par.querySelectorAll('.ocr_line').forEach((line) => {
				const tokens = [];

				line.querySelectorAll('.ocrx_word').forEach((word) => {
					const recognised = word.textContent || '';
					const confidence = readConfidence(word);
					const wordText = applyLearnedCorrection(recognised);

					if (confidence < 50) {
						tokens.push(`[${wordText}]`);
					} else if (confidence < 70) {
						tokens.push(`<span confidence-medium>${wordText}</span>`);
					} else {
						// Includes medium-confidence Turkish words, which must
						// stay separated from the following word
						tokens.push(wordText);
					}

					// Store problematic words for learning; blank words would
					// later match every result text, so skip them
					if (confidence < 85 && recognised.trim() !== '') {
						rememberUncertainWord(recognised);
					}
				});

				formattedText += tokens.join(' ').trim() + '\n\n';
			});

			formattedText += '\n';
		});

		return formattedText;
	}
	}
	/**
	 * Render one processed result as a card in the results container.
	 *
	 * Before rendering, every still-unconfirmed entry of
	 * `window.ocrLearningDict` whose word occurs in the result is put to the
	 * user via `confirm`/`prompt`. A correction entered by the user is stored
	 * in the dictionary and applied to `result.content` in place, so later
	 * downloads of the same result contain the corrected text.
	 *
	 * The card shows the file name, a download button, an OCR warning when
	 * the content carries one of the OCR fallback markers, and the content
	 * itself (JSON is re-indented when it parses).
	 *
	 * @param {{fileName: string, format: string, content: string}} result
	 *   Processed result; `content` may be modified by user corrections.
	 * @returns {void}
	 */
	function displayResult(result) {
		// Check if this was an OCR fallback result
		const isOCRResult = result.content.includes('OCR processing attempted') ||
			result.content.includes('Warning: No extractable text found');

		// Scan for potential errors and ask user confirmation
		if (window.ocrLearningDict) {
			for (const [word, data] of Object.entries(window.ocrLearningDict)) {
				// An empty key is contained in every string; never prompt for it
				if (word === '' || data.confirmedCorrect !== null || !result.content.includes(word)) {
					continue;
				}

				data.suggestTime = new Date().toISOString();
				if (confirm(`Is "${word}" correctly recognized? If not, please type the correct version.`)) {
					data.confirmedCorrect = word;
				} else {
					const corrected = prompt(`Please enter correct version for "${word}":`, word);
					if (corrected) {
						data.confirmedCorrect = corrected;
						// Replace in current result. split/join performs a
						// literal replacement: OCR words may contain regex
						// metacharacters and the user's text may contain `$`,
						// both of which String.prototype.replace would interpret.
						result.content = result.content.split(word).join(corrected);
					}
				}
			}
		}

		const resultCard = document.createElement('div');
		resultCard.className = 'bg-gray-50 rounded-lg p-4 shadow-sm';

		const header = document.createElement('div');
		header.className = 'flex justify-between items-center mb-3';

		const title = document.createElement('h3');
		title.className = 'font-semibold text-lg text-gray-800 truncate';
		title.textContent = result.fileName;

		const downloadBtn = document.createElement('button');
		downloadBtn.className = 'bg-indigo-600 hover:bg-indigo-700 text-white font-medium py-1 px-3 rounded transition duration-200 flex items-center text-sm';
		downloadBtn.innerHTML = '<i data-feather="download" class="mr-1"></i> Download';
		downloadBtn.addEventListener('click', () => downloadResult(result));

		header.appendChild(title);
		header.appendChild(downloadBtn);

		const content = document.createElement('div');

		if (isOCRResult) {
			const warning = document.createElement('div');
			warning.className = 'pdf-ocr-warning';
			warning.innerHTML = `
				<div class="flex items-start">
					<i data-feather="alert-triangle" class="mr-2"></i>
					<div>
						<strong>Note:</strong> This PDF was processed using OCR as no selectable text was found.
						Results may contain errors or inaccuracies.
					</div>
				</div>
			`;
			content.appendChild(warning);
		}

		// The extracted text is untrusted, so it is only ever assigned through
		// textContent, never innerHTML
		const pre = document.createElement('pre');
		pre.className = result.format === 'formatted' ? 'ocr-result turkish-text' : '';
		pre.style.cssText = 'font-family: monospace; white-space: pre-wrap; word-wrap: break-word; line-height: 1.6;';

		if (result.format === 'json') {
			try {
				// Re-indent valid JSON for display
				const parsed = JSON.parse(result.content);
				pre.textContent = JSON.stringify(parsed, null, 2);
			} catch (e) {
				// Not valid JSON (e.g. an error message): show it verbatim
				pre.textContent = result.content;
			}
		} else {
			pre.textContent = result.content;
		}

		content.appendChild(pre);

		resultCard.appendChild(header);
		resultCard.appendChild(content);

		resultsContainer.appendChild(resultCard);
		// Turn the data-feather placeholders inserted above into icons
		feather.replace();
	}
	/**
	 * Offer a processed result to the user as a file download.
	 *
	 * The download name is the original file name with only its last
	 * extension replaced (`report.v2.pdf` -> `report.v2.md`). JSON and
	 * Markdown get their own MIME type and extension; every other format is
	 * saved as UTF-8 plain text (`.txt`).
	 *
	 * Text downloads are prefixed with a UTF-8 byte order mark so that some
	 * applications detect the encoding of Turkish characters. JSON is written
	 * without one, because JSON text must not start with a BOM.
	 *
	 * @param {{fileName: string, format: string, content: string}} result
	 * @returns {void}
	 */
	function downloadResult(result) {
		const formats = {
			json: { mimeType: 'application/json;charset=utf-8', extension: 'json' },
			markdown: { mimeType: 'text/markdown;charset=utf-8', extension: 'md' }
		};
		const plainText = { mimeType: 'text/plain;charset=utf-8', extension: 'txt' };
		const isKnownFormat = Object.prototype.hasOwnProperty.call(formats, result.format);
		const { mimeType, extension } = isKnownFormat ? formats[result.format] : plainText;

		// Strip only the final extension; a leading dot (".notes") is part of
		// the name, not an extension
		const fileName = String(result.fileName || 'document');
		const lastDot = fileName.lastIndexOf('.');
		const baseName = lastDot > 0 ? fileName.slice(0, lastDot) : fileName;

		// Blob encodes string parts as UTF-8
		const content = String(result.content == null ? '' : result.content);
		const parts = [content];
		if (result.format !== 'json') {
			parts.unshift(new Uint8Array([0xEF, 0xBB, 0xBF]));
		}

		const blob = new Blob(parts, { type: mimeType });
		const url = URL.createObjectURL(blob);
		const a = document.createElement('a');
		a.href = url;
		a.download = `${baseName}.${extension}`;
		// The anchor has to be in the document for click() to start the
		// download in some browsers
		document.body.appendChild(a);
		a.click();
		document.body.removeChild(a);
		URL.revokeObjectURL(url);
	}
	// Clicking "download all" triggers one download per processed result,
	// in the order the results were produced
	downloadAllBtn.addEventListener('click', function () {
		for (const result of processedResults) {
			downloadResult(result);
		}
	});
	});