ASDAD34 committed on
Commit
3a12466
·
verified ·
1 Parent(s): 983e5f7

Yapamıyor; Türkçe OCR yapacak. ABBYY FineReader'ın format çevirisi gibi, Adobe Reader DC'nin OCR format çevirisi gibi çalışmalı.

Browse files
Files changed (3) hide show
  1. index.html +2 -1
  2. script.js +56 -7
  3. style.css +16 -2
index.html CHANGED
@@ -42,7 +42,8 @@
42
  <option value="json">JSON</option>
43
  <option value="markdown">Markdown</option>
44
  <option value="text">Plain Text</option>
45
- </select>
 
46
  </div>
47
  <button id="processBtn" class="bg-indigo-600 hover:bg-indigo-700 text-white font-medium py-3 px-6 rounded-lg transition duration-200 flex items-center justify-center mt-6 sm:mt-auto">
48
  <i data-feather="cpu" class="mr-2"></i> Process Files
 
42
  <option value="json">JSON</option>
43
  <option value="markdown">Markdown</option>
44
  <option value="text">Plain Text</option>
45
+ <option value="formatted">Formatted Text</option>
46
+ </select>
47
  </div>
48
  <button id="processBtn" class="bg-indigo-600 hover:bg-indigo-700 text-white font-medium py-3 px-6 rounded-lg transition duration-200 flex items-center justify-center mt-6 sm:mt-auto">
49
  <i data-feather="cpu" class="mr-2"></i> Process Files
script.js CHANGED
@@ -240,18 +240,67 @@ document.addEventListener('DOMContentLoaded', function() {
240
  reader.readAsArrayBuffer(file);
241
  });
242
  }
243
-
244
- async function extractTextFromImage(file) {
245
  return new Promise((resolve, reject) => {
246
  Tesseract.recognize(
247
  file,
248
- 'eng',
249
- { logger: m => console.log(m) }
250
- ).then(({ data: { text } }) => {
251
- resolve(text);
 
 
 
 
 
 
 
 
 
 
 
 
252
  }).catch(reject);
253
  });
254
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
 
256
  function displayResult(result) {
257
  const resultCard = document.createElement('div');
 
240
  reader.readAsArrayBuffer(file);
241
  });
242
  }
243
+ async function extractTextFromImage(file) {
 
244
  return new Promise((resolve, reject) => {
245
  Tesseract.recognize(
246
  file,
247
+ 'tur+eng', // Turkish + English languages
248
+ {
249
+ logger: m => console.log(m),
250
+ preserve_interword_spaces: true,
251
+ tessedit_pageseg_mode: 6, // Assume a single uniform block of text
252
+ tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" ',
253
+ tessedit_create_hocr: 1 // Include formatting info
254
+ }
255
+ ).then(({ data: { text, hocr } }) => {
256
+ if (outputFormat.value === 'formatted') {
257
+ // Process formatted output similar to Adobe/Abbyy
258
+ const formatted = processFormattedOCR(hocr);
259
+ resolve(formatted);
260
+ } else {
261
+ resolve(text);
262
+ }
263
  }).catch(reject);
264
  });
265
+
266
+ function processFormattedOCR(hocr) {
267
+ // Parse hOCR output to preserve formatting and layout
268
+ const parser = new DOMParser();
269
+ const doc = parser.parseFromString(hocr, 'text/html');
270
+ const paragraphs = doc.querySelectorAll('.ocr_par');
271
+
272
+ let formattedText = '';
273
+
274
+ paragraphs.forEach(par => {
275
+ const lines = par.querySelectorAll('.ocr_line');
276
+ lines.forEach(line => {
277
+ const words = line.querySelectorAll('.ocrx_word');
278
+ let lineText = '';
279
+
280
+ words.forEach((word, index) => {
281
+ const wordText = word.textContent || '';
282
+ const wordConfidence = parseFloat(word.getAttribute('title')
283
+ .match(/x_wconf (\d+)/)[1]);
284
+
285
+ // Apply formatting based on confidence and context
286
+ if (wordConfidence < 60) {
287
+ lineText += `[${wordText}] `;
288
+ } else if (wordConfidence < 80 && index > 0) {
289
+ lineText += `${wordText}`;
290
+ } else {
291
+ lineText += `${wordText} `;
292
+ }
293
+ });
294
+
295
+ formattedText += lineText.trim() + '\n';
296
+ });
297
+
298
+ formattedText += '\n';
299
+ });
300
+
301
+ return formattedText;
302
+ }
303
+ }
304
 
305
  function displayResult(result) {
306
  const resultCard = document.createElement('div');
style.css CHANGED
@@ -20,7 +20,6 @@
20
  max-height: 150px;
21
  object-fit: contain;
22
  }
23
-
24
  #resultsContainer pre {
25
  background-color: #f8f9fa;
26
  padding: 16px;
@@ -30,4 +29,19 @@
30
  word-wrap: break-word;
31
  max-height: 500px;
32
  overflow-y: auto;
33
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  max-height: 150px;
21
  object-fit: contain;
22
  }
 
23
  #resultsContainer pre {
24
  background-color: #f8f9fa;
25
  padding: 16px;
 
29
  word-wrap: break-word;
30
  max-height: 500px;
31
  overflow-y: auto;
32
+ font-family: monospace;
33
+ line-height: 1.5;
34
+ }
35
+
36
/* OCR confidence styling.
   Highlights words inside a result <pre> that are marked as low or medium
   confidence. The bare attribute selectors are kept as they were; a class and
   a data-confidence attribute are accepted as well, so markup does not have to
   rely on a non-standard attribute name. */
pre [confidence-low],
pre .confidence-low,
pre [data-confidence="low"] {
    background-color: #ffebee;
    color: #c62828;
    padding: 0 2px;
}

pre [confidence-medium],
pre .confidence-medium,
pre [data-confidence="medium"] {
    background-color: #fff8e1;
    color: #f57f17;
    padding: 0 2px;
}