SuriRaja committed on
Commit
2d38d1d
·
verified ·
1 Parent(s): c91c330

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -14
app.py CHANGED
@@ -21,7 +21,7 @@ def load_and_compare_documents(file1, file2):
21
  # Perform OCR-based comparison across all pages
22
  ocr_differences, marked_images_1, marked_images_2 = perform_ocr_and_compare(file1_content, file2_content)
23
 
24
- # Generate a PDF with side-by-side comparisons and observation tables
25
  pdf_buffer = create_pdf_with_side_by_side(marked_images_1, marked_images_2, ocr_differences)
26
 
27
  # Compile an overall summary of differences
@@ -76,11 +76,11 @@ def perform_ocr_and_compare(content1, content2):
76
 
77
  for i, change in enumerate(diff):
78
  if change.startswith("+ "):
79
- page_diffs.append(f"Added '{change[2:]}' at position {i} on page {page_num}")
80
  elif change.startswith("- "):
81
- page_diffs.append(f"Deleted '{change[2:]}' at position {i} on page {page_num}")
82
  elif change.startswith("? "):
83
- page_diffs.append(f"Modified '{change[2:]}' at position {i} on page {page_num}")
84
 
85
  ocr_differences.append({"page": page_num, "differences": page_diffs})
86
 
@@ -91,7 +91,6 @@ def perform_ocr_and_compare(content1, content2):
91
  flattened_bbox = [coord for point in bbox for coord in point]
92
  draw1.rectangle([flattened_bbox[0], flattened_bbox[1], flattened_bbox[2], flattened_bbox[3]], outline="blue", width=2)
93
  draw2.rectangle([flattened_bbox[0], flattened_bbox[1], flattened_bbox[2], flattened_bbox[3]], outline="blue", width=2)
94
- diff_index += 1
95
 
96
  marked_images_1[page_num] = marked_img1
97
  marked_images_2[page_num] = marked_img2
@@ -137,12 +136,12 @@ def create_pdf_with_side_by_side(marked_images_1, marked_images_2, ocr_differenc
137
  for ocr_diff in ocr_differences:
138
  if ocr_diff["page"] == page_num:
139
  for diff in ocr_diff["differences"]:
140
- if "Added" in diff:
141
- data["Additions"].append(diff)
142
- elif "Deleted" in diff:
143
- data["Deletions"].append(diff)
144
- elif "Modified" in diff:
145
- data["Modifications"].append(diff)
146
 
147
  # Convert data to DataFrame for formatting
148
  df = pd.DataFrame.from_dict(data, orient="index").transpose()
@@ -164,9 +163,9 @@ def create_pdf_with_side_by_side(marked_images_1, marked_images_2, ocr_differenc
164
  return pdf_buffer
165
 
166
  def generate_overall_summary(ocr_differences):
167
- total_additions = sum(len(diff["differences"]) for diff in ocr_differences if any("Added" in d for d in diff["differences"]))
168
- total_deletions = sum(len(diff["differences"]) for diff in ocr_differences if any("Deleted" in d for d in diff["differences"]))
169
- total_modifications = sum(len(diff["differences"]) for diff in ocr_differences if any("Modified" in d for d in diff["differences"]))
170
 
171
  overall_summary = {
172
  "total_additions": total_additions,
 
21
  # Perform OCR-based comparison across all pages
22
  ocr_differences, marked_images_1, marked_images_2 = perform_ocr_and_compare(file1_content, file2_content)
23
 
24
+ # Generate a PDF with side-by-side comparisons and detailed observation tables
25
  pdf_buffer = create_pdf_with_side_by_side(marked_images_1, marked_images_2, ocr_differences)
26
 
27
  # Compile an overall summary of differences
 
76
 
77
  for i, change in enumerate(diff):
78
  if change.startswith("+ "):
79
+ page_diffs.append({"type": "Added", "value": change[2:], "index": i})
80
  elif change.startswith("- "):
81
+ page_diffs.append({"type": "Deleted", "value": change[2:], "index": i})
82
  elif change.startswith("? "):
83
+ page_diffs.append({"type": "Modified", "value": change[2:], "index": i})
84
 
85
  ocr_differences.append({"page": page_num, "differences": page_diffs})
86
 
 
91
  flattened_bbox = [coord for point in bbox for coord in point]
92
  draw1.rectangle([flattened_bbox[0], flattened_bbox[1], flattened_bbox[2], flattened_bbox[3]], outline="blue", width=2)
93
  draw2.rectangle([flattened_bbox[0], flattened_bbox[1], flattened_bbox[2], flattened_bbox[3]], outline="blue", width=2)
 
94
 
95
  marked_images_1[page_num] = marked_img1
96
  marked_images_2[page_num] = marked_img2
 
136
  for ocr_diff in ocr_differences:
137
  if ocr_diff["page"] == page_num:
138
  for diff in ocr_diff["differences"]:
139
+ if diff["type"] == "Added":
140
+ data["Additions"].append(f"{diff['value']} (Index: {diff['index']})")
141
+ elif diff["type"] == "Deleted":
142
+ data["Deletions"].append(f"{diff['value']} (Index: {diff['index']})")
143
+ elif diff["type"] == "Modified":
144
+ data["Modifications"].append(f"{diff['value']} (Index: {diff['index']})")
145
 
146
  # Convert data to DataFrame for formatting
147
  df = pd.DataFrame.from_dict(data, orient="index").transpose()
 
163
  return pdf_buffer
164
 
165
  def generate_overall_summary(ocr_differences):
166
+ total_additions = sum(len([d for d in diff["differences"] if d["type"] == "Added"]) for diff in ocr_differences)
167
+ total_deletions = sum(len([d for d in diff["differences"] if d["type"] == "Deleted"]) for diff in ocr_differences)
168
+ total_modifications = sum(len([d for d in diff["differences"] if d["type"] == "Modified"]) for diff in ocr_differences)
169
 
170
  overall_summary = {
171
  "total_additions": total_additions,