Spaces:
Running
Running
İçeriğini ocr yapamıyor.
Browse files
script.js
CHANGED
|
@@ -114,7 +114,20 @@ document.addEventListener('DOMContentLoaded', function() {
|
|
| 114 |
processBtn.innerHTML = '<i data-feather="cpu" class="mr-2"></i> Process Files';
|
| 115 |
feather.replace();
|
| 116 |
}
|
| 117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
|
| 119 |
async function processFile(file) {
|
| 120 |
const format = outputFormat.value;
|
|
@@ -249,9 +262,13 @@ async function extractTextFromImage(file) {
|
|
| 249 |
logger: m => console.log(m),
|
| 250 |
preserve_interword_spaces: true,
|
| 251 |
tessedit_pageseg_mode: 6, // Assume a single uniform block of text
|
| 252 |
-
tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" ',
|
| 253 |
-
tessedit_create_hocr: 1 // Include formatting info
|
| 254 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
).then(({ data: { text, hocr } }) => {
|
| 256 |
if (outputFormat.value === 'formatted') {
|
| 257 |
// Process formatted output similar to Adobe/Abbyy
|
|
@@ -262,7 +279,6 @@ async function extractTextFromImage(file) {
|
|
| 262 |
}
|
| 263 |
}).catch(reject);
|
| 264 |
});
|
| 265 |
-
|
| 266 |
function processFormattedOCR(hocr) {
|
| 267 |
// Parse hOCR output to preserve formatting and layout
|
| 268 |
const parser = new DOMParser();
|
|
@@ -282,24 +298,27 @@ async function extractTextFromImage(file) {
|
|
| 282 |
const wordConfidence = parseFloat(word.getAttribute('title')
|
| 283 |
.match(/x_wconf (\d+)/)[1]);
|
| 284 |
|
| 285 |
-
//
|
| 286 |
-
if (wordConfidence <
|
| 287 |
lineText += `[${wordText}] `;
|
| 288 |
-
} else if (wordConfidence <
|
|
|
|
|
|
|
| 289 |
lineText += `${wordText}`;
|
| 290 |
} else {
|
| 291 |
lineText += `${wordText} `;
|
| 292 |
}
|
| 293 |
});
|
| 294 |
|
| 295 |
-
|
|
|
|
| 296 |
});
|
| 297 |
|
| 298 |
formattedText += '\n';
|
| 299 |
});
|
| 300 |
|
| 301 |
return formattedText;
|
| 302 |
-
|
| 303 |
}
|
| 304 |
|
| 305 |
function displayResult(result) {
|
|
|
|
| 114 |
processBtn.innerHTML = '<i data-feather="cpu" class="mr-2"></i> Process Files';
|
| 115 |
feather.replace();
|
| 116 |
}
|
| 117 |
+
|
| 118 |
+
// Load additional Turkish language data
|
| 119 |
+
function loadTurkishLanguageData() {
|
| 120 |
+
if (!window.tesseractTurDataLoaded) {
|
| 121 |
+
Tesseract.addLanguageData('tur', {
|
| 122 |
+
data: '/static/tesseract/tur.traineddata.gz'
|
| 123 |
+
});
|
| 124 |
+
window.tesseractTurDataLoaded = true;
|
| 125 |
+
}
|
| 126 |
+
}
|
| 127 |
+
|
| 128 |
+
loadTurkishLanguageData();
|
| 129 |
+
}
|
| 130 |
+
);
|
| 131 |
|
| 132 |
async function processFile(file) {
|
| 133 |
const format = outputFormat.value;
|
|
|
|
| 262 |
logger: m => console.log(m),
|
| 263 |
preserve_interword_spaces: true,
|
| 264 |
tessedit_pageseg_mode: 6, // Assume a single uniform block of text
|
| 265 |
+
tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" ğüşıöçĞÜŞİÖÇ', // Added Turkish chars
|
| 266 |
+
tessedit_create_hocr: 1, // Include formatting info
|
| 267 |
+
load_system_dawg: 1,
|
| 268 |
+
load_freq_dawg: 1,
|
| 269 |
+
user_words_suffix: 'tur',
|
| 270 |
+
user_patterns_suffix: 'tur'
|
| 271 |
+
}
|
| 272 |
).then(({ data: { text, hocr } }) => {
|
| 273 |
if (outputFormat.value === 'formatted') {
|
| 274 |
// Process formatted output similar to Adobe/Abbyy
|
|
|
|
| 279 |
}
|
| 280 |
}).catch(reject);
|
| 281 |
});
|
|
|
|
| 282 |
function processFormattedOCR(hocr) {
|
| 283 |
// Parse hOCR output to preserve formatting and layout
|
| 284 |
const parser = new DOMParser();
|
|
|
|
| 298 |
const wordConfidence = parseFloat(word.getAttribute('title')
|
| 299 |
.match(/x_wconf (\d+)/)[1]);
|
| 300 |
|
| 301 |
+
// Better handling of Turkish characters and confidence
|
| 302 |
+
if (wordConfidence < 50) {
|
| 303 |
lineText += `[${wordText}] `;
|
| 304 |
+
} else if (wordConfidence < 70) {
|
| 305 |
+
lineText += `<span confidence-medium>${wordText}</span> `;
|
| 306 |
+
} else if (wordConfidence < 85 && /[ğüşıöçĞÜŞİÖÇ]/.test(wordText)) {
|
| 307 |
lineText += `${wordText}`;
|
| 308 |
} else {
|
| 309 |
lineText += `${wordText} `;
|
| 310 |
}
|
| 311 |
});
|
| 312 |
|
| 313 |
+
// Better line spacing for Turkish text
|
| 314 |
+
formattedText += lineText.trim() + '\n\n';
|
| 315 |
});
|
| 316 |
|
| 317 |
formattedText += '\n';
|
| 318 |
});
|
| 319 |
|
| 320 |
return formattedText;
|
| 321 |
+
}
|
| 322 |
}
|
| 323 |
|
| 324 |
function displayResult(result) {
|
style.css
CHANGED
|
@@ -39,9 +39,21 @@ pre [confidence-low] {
|
|
| 39 |
color: #c62828;
|
| 40 |
padding: 0 2px;
|
| 41 |
}
|
| 42 |
-
|
| 43 |
pre [confidence-medium] {
|
| 44 |
background-color: #fff8e1;
|
| 45 |
color: #f57f17;
|
| 46 |
padding: 0 2px;
|
| 47 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
color: #c62828;
|
| 40 |
padding: 0 2px;
|
| 41 |
}
|
|
|
|
| 42 |
pre [confidence-medium] {
|
| 43 |
background-color: #fff8e1;
|
| 44 |
color: #f57f17;
|
| 45 |
padding: 0 2px;
|
| 46 |
}
|
| 47 |
+
|
| 48 |
+
/* Turkish specific OCR styling */
|
| 49 |
+
.turkish-text {
|
| 50 |
+
font-family: 'Noto Sans', sans-serif;
|
| 51 |
+
line-height: 1.8;
|
| 52 |
+
letter-spacing: 0.5px;
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
.ocr-result {
|
| 56 |
+
white-space: pre-wrap;
|
| 57 |
+
font-size: 1.1rem;
|
| 58 |
+
font-family: 'Segoe UI', Tahoma, sans-serif;
|
| 59 |
+
}
|