SuriRaja committed on
Commit
2e4777d
·
verified ·
1 Parent(s): 5adc3d1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -5
app.py CHANGED
@@ -2,12 +2,12 @@ import streamlit as st
2
  import fitz # PyMuPDF
3
  import difflib
4
  from PIL import Image, ImageChops, ImageDraw
5
- import pytesseract
6
  import io
7
  import re
8
 
9
- # Set up Tesseract path if needed (adjust as per system requirements)
10
- # pytesseract.pytesseract.tesseract_cmd = '/usr/local/bin/tesseract'
11
 
12
  def load_and_compare_documents(file1, file2):
13
  file1_content = file1.read()
@@ -141,8 +141,8 @@ def perform_ocr_and_compare(content1, content2):
141
  images2 = pdf_to_images(content2)
142
 
143
  for (page_num, img1), (_, img2) in zip(images1, images2):
144
- text1 = pytesseract.image_to_string(img1)
145
- text2 = pytesseract.image_to_string(img2)
146
 
147
  if text1 != text2:
148
  diff = list(difflib.ndiff(text1, text2))
 
2
  import fitz # PyMuPDF
3
  import difflib
4
  from PIL import Image, ImageChops, ImageDraw
5
+ import easyocr
6
  import io
7
  import re
8
 
9
+ # Initialize the easyocr Reader
10
+ ocr_reader = easyocr.Reader(['en'])
11
 
12
  def load_and_compare_documents(file1, file2):
13
  file1_content = file1.read()
 
141
  images2 = pdf_to_images(content2)
142
 
143
  for (page_num, img1), (_, img2) in zip(images1, images2):
144
+ text1 = ' '.join([result[1] for result in ocr_reader.readtext(img1)])
145
+ text2 = ' '.join([result[1] for result in ocr_reader.readtext(img2)])
146
 
147
  if text1 != text2:
148
  diff = list(difflib.ndiff(text1, text2))