Spaces:
Running
Running
Yapamıyor. Türkçe OCR yapacak; ABBYY FineReader'ın format çevirisi gibi, Adobe Reader DC'nin OCR format çevirisi gibi çalışmalı.
Browse files- index.html +2 -1
- script.js +56 -7
- style.css +16 -2
index.html
CHANGED
|
@@ -42,7 +42,8 @@
|
|
| 42 |
<option value="json">JSON</option>
|
| 43 |
<option value="markdown">Markdown</option>
|
| 44 |
<option value="text">Plain Text</option>
|
| 45 |
-
|
|
|
|
| 46 |
</div>
|
| 47 |
<button id="processBtn" class="bg-indigo-600 hover:bg-indigo-700 text-white font-medium py-3 px-6 rounded-lg transition duration-200 flex items-center justify-center mt-6 sm:mt-auto">
|
| 48 |
<i data-feather="cpu" class="mr-2"></i> Process Files
|
|
|
|
| 42 |
<option value="json">JSON</option>
|
| 43 |
<option value="markdown">Markdown</option>
|
| 44 |
<option value="text">Plain Text</option>
|
| 45 |
+
<option value="formatted">Formatted Text</option>
|
| 46 |
+
</select>
|
| 47 |
</div>
|
| 48 |
<button id="processBtn" class="bg-indigo-600 hover:bg-indigo-700 text-white font-medium py-3 px-6 rounded-lg transition duration-200 flex items-center justify-center mt-6 sm:mt-auto">
|
| 49 |
<i data-feather="cpu" class="mr-2"></i> Process Files
|
script.js
CHANGED
|
@@ -240,18 +240,67 @@ document.addEventListener('DOMContentLoaded', function() {
|
|
| 240 |
reader.readAsArrayBuffer(file);
|
| 241 |
});
|
| 242 |
}
|
| 243 |
-
|
| 244 |
-
async function extractTextFromImage(file) {
|
| 245 |
return new Promise((resolve, reject) => {
|
| 246 |
Tesseract.recognize(
|
| 247 |
file,
|
| 248 |
-
'eng',
|
| 249 |
-
{
|
| 250 |
-
|
| 251 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
}).catch(reject);
|
| 253 |
});
|
| 254 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
|
| 256 |
function displayResult(result) {
|
| 257 |
const resultCard = document.createElement('div');
|
|
|
|
| 240 |
reader.readAsArrayBuffer(file);
|
| 241 |
});
|
| 242 |
}
|
| 243 |
+
/**
 * Runs Tesseract OCR (Turkish + English) on an image and returns the text.
 *
 * When the `outputFormat` control (defined elsewhere in this script) is set to
 * 'formatted' and the engine returned hOCR markup, the hOCR is converted to
 * line/paragraph-preserving text in which low-confidence words are wrapped in
 * square brackets. In every other case the plain recognised text is returned.
 *
 * @param {File|Blob|string} file - Image handed straight to Tesseract.recognize.
 * @returns {Promise<string>} Recognised text.
 * @throws Rejects with whatever error Tesseract.recognize rejects with.
 */
async function extractTextFromImage(file) {
    const ASCII_ALPHANUMERICS =
        'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789';
    // A whitelist discards every character that is not listed, so the Turkish
    // letters must be present or 'tur' recognition can never emit them.
    const TURKISH_LETTERS = 'çğıöşüÇĞİÖŞÜ';
    const PUNCTUATION = '.,!?-(){}[]/\\\'" ';
    const CHAR_WHITELIST = ASCII_ALPHANUMERICS + TURKISH_LETTERS + PUNCTUATION;

    // Words recognised below this confidence (0-100) are flagged with brackets.
    const LOW_CONFIDENCE_THRESHOLD = 60;

    /**
     * Reads the x_wconf value from an hOCR word's title attribute.
     * Returns NaN when the attribute or the field is absent, so the word is
     * simply left unflagged instead of crashing the whole conversion.
     */
    function parseWordConfidence(word) {
        const title = word.getAttribute('title') || '';
        const match = title.match(/x_wconf\s+(\d+(?:\.\d+)?)/);
        return match ? Number.parseFloat(match[1]) : Number.NaN;
    }

    /**
     * Converts hOCR markup to text: one output line per .ocr_line, a blank
     * line after each .ocr_par, words separated by exactly one space.
     */
    function processFormattedOCR(hocr) {
        const doc = new DOMParser().parseFromString(hocr, 'text/html');
        let formattedText = '';

        for (const par of doc.querySelectorAll('.ocr_par')) {
            for (const line of par.querySelectorAll('.ocr_line')) {
                const tokens = [];

                for (const word of line.querySelectorAll('.ocrx_word')) {
                    const wordText = (word.textContent || '').trim();
                    if (wordText === '') {
                        continue;
                    }
                    // NaN compares false, so words without a confidence stay plain.
                    const isLowConfidence =
                        parseWordConfidence(word) < LOW_CONFIDENCE_THRESHOLD;
                    tokens.push(isLowConfidence ? `[${wordText}]` : wordText);
                }

                formattedText += `${tokens.join(' ')}\n`;
            }

            formattedText += '\n';
        }

        return formattedText;
    }

    const { data: { text, hocr } } = await Tesseract.recognize(
        file,
        'tur+eng', // Turkish + English languages
        {
            logger: m => console.log(m),
            preserve_interword_spaces: true,
            tessedit_pageseg_mode: 6, // Assume a single uniform block of text
            tessedit_char_whitelist: CHAR_WHITELIST,
            tessedit_create_hocr: 1 // Include formatting info
        }
    );

    // Fall back to plain text when the engine did not produce hOCR output.
    if (outputFormat.value === 'formatted' && hocr) {
        return processFormattedOCR(hocr);
    }
    return text;
}
|
| 304 |
|
| 305 |
function displayResult(result) {
|
| 306 |
const resultCard = document.createElement('div');
|
style.css
CHANGED
|
@@ -20,7 +20,6 @@
|
|
| 20 |
max-height: 150px;
|
| 21 |
object-fit: contain;
|
| 22 |
}
|
| 23 |
-
|
| 24 |
#resultsContainer pre {
|
| 25 |
background-color: #f8f9fa;
|
| 26 |
padding: 16px;
|
|
@@ -30,4 +29,19 @@
|
|
| 30 |
word-wrap: break-word;
|
| 31 |
max-height: 500px;
|
| 32 |
overflow-y: auto;
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
max-height: 150px;
|
| 21 |
object-fit: contain;
|
| 22 |
}
|
|
|
|
| 23 |
#resultsContainer pre {
|
| 24 |
background-color: #f8f9fa;
|
| 25 |
padding: 16px;
|
|
|
|
| 29 |
word-wrap: break-word;
|
| 30 |
max-height: 500px;
|
| 31 |
overflow-y: auto;
|
| 32 |
+
font-family: monospace;
|
| 33 |
+
line-height: 1.5;
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
/* OCR confidence styling: spacing shared by both confidence levels */
pre [confidence-low],
pre [confidence-medium] {
    padding: 0 2px;
}

/* Low-confidence words: dark red on pale red */
pre [confidence-low] {
    color: #c62828;
    background-color: #ffebee;
}

/* Medium-confidence words: amber on pale yellow */
pre [confidence-medium] {
    color: #f57f17;
    background-color: #fff8e1;
}
|