ASDAD34 committed on
Commit
4b28095
·
verified ·
1 Parent(s): 3a12466

İçeriğini OCR yapamıyor.

Browse files
Files changed (2) hide show
  1. script.js +29 -10
  2. style.css +13 -1
script.js CHANGED
@@ -114,7 +114,20 @@ document.addEventListener('DOMContentLoaded', function() {
114
  processBtn.innerHTML = '<i data-feather="cpu" class="mr-2"></i> Process Files';
115
  feather.replace();
116
  }
117
- });
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
  async function processFile(file) {
120
  const format = outputFormat.value;
@@ -249,9 +262,13 @@ async function extractTextFromImage(file) {
249
  logger: m => console.log(m),
250
  preserve_interword_spaces: true,
251
  tessedit_pageseg_mode: 6, // Assume a single uniform block of text
252
- tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" ',
253
- tessedit_create_hocr: 1 // Include formatting info
254
- }
 
 
 
 
255
  ).then(({ data: { text, hocr } }) => {
256
  if (outputFormat.value === 'formatted') {
257
  // Process formatted output similar to Adobe/Abbyy
@@ -262,7 +279,6 @@ async function extractTextFromImage(file) {
262
  }
263
  }).catch(reject);
264
  });
265
-
266
  function processFormattedOCR(hocr) {
267
  // Parse hOCR output to preserve formatting and layout
268
  const parser = new DOMParser();
@@ -282,24 +298,27 @@ async function extractTextFromImage(file) {
282
  const wordConfidence = parseFloat(word.getAttribute('title')
283
  .match(/x_wconf (\d+)/)[1]);
284
 
285
- // Apply formatting based on confidence and context
286
- if (wordConfidence < 60) {
287
  lineText += `[${wordText}] `;
288
- } else if (wordConfidence < 80 && index > 0) {
 
 
289
  lineText += `${wordText}`;
290
  } else {
291
  lineText += `${wordText} `;
292
  }
293
  });
294
 
295
- formattedText += lineText.trim() + '\n';
 
296
  });
297
 
298
  formattedText += '\n';
299
  });
300
 
301
  return formattedText;
302
- }
303
  }
304
 
305
  function displayResult(result) {
 
114
  processBtn.innerHTML = '<i data-feather="cpu" class="mr-2"></i> Process Files';
115
  feather.replace();
116
  }
117
+
118
+ // Load additional Turkish language data
119
+ function loadTurkishLanguageData() {
120
+ if (!window.tesseractTurDataLoaded) {
121
+ Tesseract.addLanguageData('tur', {
122
+ data: '/static/tesseract/tur.traineddata.gz'
123
+ });
124
+ window.tesseractTurDataLoaded = true;
125
+ }
126
+ }
127
+
128
+ loadTurkishLanguageData();
129
+ }
130
+ );
131
 
132
  async function processFile(file) {
133
  const format = outputFormat.value;
 
262
  logger: m => console.log(m),
263
  preserve_interword_spaces: true,
264
  tessedit_pageseg_mode: 6, // Assume a single uniform block of text
265
+ tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" ğüşıöçĞÜŞİÖÇ', // Added Turkish chars
266
+ tessedit_create_hocr: 1, // Include formatting info
267
+ load_system_dawg: 1,
268
+ load_freq_dawg: 1,
269
+ user_words_suffix: 'tur',
270
+ user_patterns_suffix: 'tur'
271
+ }
272
  ).then(({ data: { text, hocr } }) => {
273
  if (outputFormat.value === 'formatted') {
274
  // Process formatted output similar to Adobe/Abbyy
 
279
  }
280
  }).catch(reject);
281
  });
 
282
  function processFormattedOCR(hocr) {
283
  // Parse hOCR output to preserve formatting and layout
284
  const parser = new DOMParser();
 
298
  const wordConfidence = parseFloat(word.getAttribute('title')
299
  .match(/x_wconf (\d+)/)[1]);
300
 
301
+ // Better handling of Turkish characters and confidence
302
+ if (wordConfidence < 50) {
303
  lineText += `[${wordText}] `;
304
+ } else if (wordConfidence < 70) {
305
+ lineText += `<span confidence-medium>${wordText}</span> `;
306
+ } else if (wordConfidence < 85 && /[ğüşıöçĞÜŞİÖÇ]/.test(wordText)) {
307
  lineText += `${wordText}`;
308
  } else {
309
  lineText += `${wordText} `;
310
  }
311
  });
312
 
313
+ // Better line spacing for Turkish text
314
+ formattedText += lineText.trim() + '\n\n';
315
  });
316
 
317
  formattedText += '\n';
318
  });
319
 
320
  return formattedText;
321
+ }
322
  }
323
 
324
  function displayResult(result) {
style.css CHANGED
@@ -39,9 +39,21 @@ pre [confidence-low] {
39
  color: #c62828;
40
  padding: 0 2px;
41
  }
42
-
43
  pre [confidence-medium] {
44
  background-color: #fff8e1;
45
  color: #f57f17;
46
  padding: 0 2px;
47
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  color: #c62828;
40
  padding: 0 2px;
41
  }
 
42
  pre [confidence-medium] {
43
  background-color: #fff8e1;
44
  color: #f57f17;
45
  padding: 0 2px;
46
  }
47
+
48
+ /* Turkish specific OCR styling */
49
+ .turkish-text {
50
+ font-family: 'Noto Sans', sans-serif;
51
+ line-height: 1.8;
52
+ letter-spacing: 0.5px;
53
+ }
54
+
55
+ .ocr-result {
56
+ white-space: pre-wrap;
57
+ font-size: 1.1rem;
58
+ font-family: 'Segoe UI', Tahoma, sans-serif;
59
+ }