Update app.py
Browse files
app.py
CHANGED
|
@@ -2,12 +2,12 @@ import streamlit as st
|
|
| 2 |
import fitz # PyMuPDF
|
| 3 |
import difflib
|
| 4 |
from PIL import Image, ImageChops, ImageDraw
|
| 5 |
-
import
|
| 6 |
import io
|
| 7 |
import re
|
| 8 |
|
| 9 |
-
#
|
| 10 |
-
|
| 11 |
|
| 12 |
def load_and_compare_documents(file1, file2):
|
| 13 |
file1_content = file1.read()
|
|
@@ -141,8 +141,8 @@ def perform_ocr_and_compare(content1, content2):
|
|
| 141 |
images2 = pdf_to_images(content2)
|
| 142 |
|
| 143 |
for (page_num, img1), (_, img2) in zip(images1, images2):
|
| 144 |
-
text1 =
|
| 145 |
-
text2 =
|
| 146 |
|
| 147 |
if text1 != text2:
|
| 148 |
diff = list(difflib.ndiff(text1, text2))
|
|
|
|
| 2 |
import fitz # PyMuPDF
|
| 3 |
import difflib
|
| 4 |
from PIL import Image, ImageChops, ImageDraw
|
| 5 |
+
import easyocr
|
| 6 |
import io
|
| 7 |
import re
|
| 8 |
|
| 9 |
+
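# Initialize the easyocr Reader (English only). NOTE(review): Streamlit re-executes this script on every rerun, so consider caching the Reader with st.cache_resource.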
|
| 10 |
+
ocr_reader = easyocr.Reader(['en'])
|
| 11 |
|
| 12 |
def load_and_compare_documents(file1, file2):
|
| 13 |
file1_content = file1.read()
|
|
|
|
| 141 |
images2 = pdf_to_images(content2)
|
| 142 |
|
| 143 |
for (page_num, img1), (_, img2) in zip(images1, images2):
|
| 144 |
+
text1 = ' '.join([result[1] for result in ocr_reader.readtext(img1)])
|
| 145 |
+
text2 = ' '.join([result[1] for result in ocr_reader.readtext(img2)])
|
| 146 |
|
| 147 |
if text1 != text2:
|
| 148 |
diff = list(difflib.ndiff(text1, text2))
|