Update app.py
Browse files
app.py
CHANGED
|
@@ -21,7 +21,7 @@ def load_and_compare_documents(file1, file2):
|
|
| 21 |
# Perform OCR-based comparison across all pages
|
| 22 |
ocr_differences, marked_images_1, marked_images_2 = perform_ocr_and_compare(file1_content, file2_content)
|
| 23 |
|
| 24 |
-
# Generate a PDF with side-by-side comparisons and observation tables
|
| 25 |
pdf_buffer = create_pdf_with_side_by_side(marked_images_1, marked_images_2, ocr_differences)
|
| 26 |
|
| 27 |
# Compile an overall summary of differences
|
|
@@ -76,11 +76,11 @@ def perform_ocr_and_compare(content1, content2):
|
|
| 76 |
|
| 77 |
for i, change in enumerate(diff):
|
| 78 |
if change.startswith("+ "):
|
| 79 |
-
page_diffs.append(
|
| 80 |
elif change.startswith("- "):
|
| 81 |
-
page_diffs.append(
|
| 82 |
elif change.startswith("? "):
|
| 83 |
-
page_diffs.append(
|
| 84 |
|
| 85 |
ocr_differences.append({"page": page_num, "differences": page_diffs})
|
| 86 |
|
|
@@ -91,7 +91,6 @@ def perform_ocr_and_compare(content1, content2):
|
|
| 91 |
flattened_bbox = [coord for point in bbox for coord in point]
|
| 92 |
draw1.rectangle([flattened_bbox[0], flattened_bbox[1], flattened_bbox[2], flattened_bbox[3]], outline="blue", width=2)
|
| 93 |
draw2.rectangle([flattened_bbox[0], flattened_bbox[1], flattened_bbox[2], flattened_bbox[3]], outline="blue", width=2)
|
| 94 |
-
diff_index += 1
|
| 95 |
|
| 96 |
marked_images_1[page_num] = marked_img1
|
| 97 |
marked_images_2[page_num] = marked_img2
|
|
@@ -137,12 +136,12 @@ def create_pdf_with_side_by_side(marked_images_1, marked_images_2, ocr_differenc
|
|
| 137 |
for ocr_diff in ocr_differences:
|
| 138 |
if ocr_diff["page"] == page_num:
|
| 139 |
for diff in ocr_diff["differences"]:
|
| 140 |
-
if "
|
| 141 |
-
data["Additions"].append(diff)
|
| 142 |
-
elif "
|
| 143 |
-
data["Deletions"].append(diff)
|
| 144 |
-
elif "
|
| 145 |
-
data["Modifications"].append(diff)
|
| 146 |
|
| 147 |
# Convert data to DataFrame for formatting
|
| 148 |
df = pd.DataFrame.from_dict(data, orient="index").transpose()
|
|
@@ -164,9 +163,9 @@ def create_pdf_with_side_by_side(marked_images_1, marked_images_2, ocr_differenc
|
|
| 164 |
return pdf_buffer
|
| 165 |
|
| 166 |
def generate_overall_summary(ocr_differences):
|
| 167 |
-
total_additions = sum(len(
|
| 168 |
-
total_deletions = sum(len(
|
| 169 |
-
total_modifications = sum(len(
|
| 170 |
|
| 171 |
overall_summary = {
|
| 172 |
"total_additions": total_additions,
|
|
|
|
| 21 |
# Perform OCR-based comparison across all pages
|
| 22 |
ocr_differences, marked_images_1, marked_images_2 = perform_ocr_and_compare(file1_content, file2_content)
|
| 23 |
|
| 24 |
+
# Generate a PDF with side-by-side comparisons and detailed observation tables
|
| 25 |
pdf_buffer = create_pdf_with_side_by_side(marked_images_1, marked_images_2, ocr_differences)
|
| 26 |
|
| 27 |
# Compile an overall summary of differences
|
|
|
|
| 76 |
|
| 77 |
for i, change in enumerate(diff):
|
| 78 |
if change.startswith("+ "):
|
| 79 |
+
page_diffs.append({"type": "Added", "value": change[2:], "index": i})
|
| 80 |
elif change.startswith("- "):
|
| 81 |
+
page_diffs.append({"type": "Deleted", "value": change[2:], "index": i})
|
| 82 |
elif change.startswith("? "):
|
| 83 |
+
page_diffs.append({"type": "Modified", "value": change[2:], "index": i})
|
| 84 |
|
| 85 |
ocr_differences.append({"page": page_num, "differences": page_diffs})
|
| 86 |
|
|
|
|
| 91 |
flattened_bbox = [coord for point in bbox for coord in point]
|
| 92 |
draw1.rectangle([flattened_bbox[0], flattened_bbox[1], flattened_bbox[2], flattened_bbox[3]], outline="blue", width=2)
|
| 93 |
draw2.rectangle([flattened_bbox[0], flattened_bbox[1], flattened_bbox[2], flattened_bbox[3]], outline="blue", width=2)
|
|
|
|
| 94 |
|
| 95 |
marked_images_1[page_num] = marked_img1
|
| 96 |
marked_images_2[page_num] = marked_img2
|
|
|
|
| 136 |
for ocr_diff in ocr_differences:
|
| 137 |
if ocr_diff["page"] == page_num:
|
| 138 |
for diff in ocr_diff["differences"]:
|
| 139 |
+
if diff["type"] == "Added":
|
| 140 |
+
data["Additions"].append(f"{diff['value']} (Index: {diff['index']})")
|
| 141 |
+
elif diff["type"] == "Deleted":
|
| 142 |
+
data["Deletions"].append(f"{diff['value']} (Index: {diff['index']})")
|
| 143 |
+
elif diff["type"] == "Modified":
|
| 144 |
+
data["Modifications"].append(f"{diff['value']} (Index: {diff['index']})")
|
| 145 |
|
| 146 |
# Convert data to DataFrame for formatting
|
| 147 |
df = pd.DataFrame.from_dict(data, orient="index").transpose()
|
|
|
|
| 163 |
return pdf_buffer
|
| 164 |
|
| 165 |
def generate_overall_summary(ocr_differences):
|
| 166 |
+
total_additions = sum(len([d for d in diff["differences"] if d["type"] == "Added"]) for diff in ocr_differences)
|
| 167 |
+
total_deletions = sum(len([d for d in diff["differences"] if d["type"] == "Deleted"]) for diff in ocr_differences)
|
| 168 |
+
total_modifications = sum(len([d for d in diff["differences"] if d["type"] == "Modified"]) for diff in ocr_differences)
|
| 169 |
|
| 170 |
overall_summary = {
|
| 171 |
"total_additions": total_additions,
|