|
|
import pymupdf |
|
|
from scripts.utility_functions import color_mapping, get_best_fuzzy_match |
|
|
|
|
|
|
|
|
def add_infos_to_pdf_agentic(doc, changes, successful_annotations, extraction_method="Landing AI", nlp_preprocessing=True):
    """
    Doc is edited in place.
    Writes a short summary box onto the first page and updates the document metadata.

    :param doc: The PyMuPDF document object.
    :type doc: pymupdf.Document
    :param changes: The changes that were processed; each entry is read with ``.get("type")``.
    :param successful_annotations: Number of changes that could be annotated (only echoed in the summary).
    :param extraction_method: Label printed in the summary as the extraction method.
    :param nlp_preprocessing: Printed in the summary as "yes" when truthy, otherwise "no".
    """
    # Tally changes per type; anything outside the known types lands in "unspecified".
    tally = dict.fromkeys(("addition", "modification", "deletion", "unspecified"), 0)
    for entry in changes:
        kind = entry.get("type", "unspecified")
        bucket = kind if kind in tally else "unspecified"
        tally[bucket] += 1

    summary_text = "".join(
        [
            "Regulatory Summary:\n",
            f"- Extraction Method: {extraction_method}\n",
            f"- Nlp preprocessing: {'yes' if nlp_preprocessing else 'no'}\n",
            f"- Total Changes: {len(changes)}, Successful Annotations: {successful_annotations}\n",
            f"- Additions: {tally['addition']}\n",
            f"- Deletions: {tally['deletion']}\n",
            f"- Modifications: {tally['modification']}\n",
        ]
    )

    # The summary is drawn on top of the existing content of the first page.
    first_page = doc.load_page(0)
    summary_box = pymupdf.Rect(10, 10, 550, 150)
    first_page.insert_textbox(
        summary_box,
        summary_text,
        fontsize=9,
        fontname="helv",
        align=pymupdf.TEXT_ALIGN_LEFT,
        color=(0, 0, 0.7),
        overlay=True,
    )

    # Prefix the existing title/author (when present) instead of discarding them.
    meta = doc.metadata
    previous_title = meta["title"]
    previous_author = meta["author"]
    author_suffix = ""
    if previous_author:
        author_suffix = " & " + previous_author
    meta["title"] = "Annotated " + (previous_title or "PDF")
    meta["author"] = "Fortiss ReguLens" + author_suffix
    meta["subject"] = "Annotated PDF with regulatory changes"
    meta["keywords"] = "regulatory, changes, annotations, pdf"
    doc.set_metadata(meta)
|
|
|
|
|
|
|
|
def add_failed_annotations_to_pdf_agentic(doc, failed_annotations):
    """
    Doc is edited in place.
    Adds failed annotations to the end of the PDF document.

    The listing is spread over as many appended pages as needed:
    ``Page.insert_textbox`` writes nothing at all when the text does not fit
    the rectangle, so a long listing on one page would leave that page blank.

    :param doc: The PyMuPDF document object.
    :type doc: pymupdf.Document
    :param failed_annotations: The failed annotations to be added. Each entry
        is read as ``{"change": {"text", "type", "category"}, "page"}``.
    :type failed_annotations: array
    """
    if not failed_annotations:
        return

    entries = ["Failed Annotations:\n"]
    for failed_annotation in failed_annotations:
        change = failed_annotation["change"]
        entries.append(
            f"Page {failed_annotation['page']}: {change['text']} "
            f"({change['type']}) Change: {change['category']}\n"
        )

    rect = pymupdf.Rect(20, 20, 580, 822)
    remaining = entries
    while remaining:
        page = doc.new_page(pno=-1)
        count = len(remaining)
        chunk = "".join(remaining)
        # A negative return value means "did not fit, nothing written", so the
        # same empty page can be retried with less text.
        while (
            page.insert_textbox(
                rect,
                chunk,
                fontsize=9,
                fontname="helv",
                align=pymupdf.TEXT_ALIGN_LEFT,
                color=(0, 0, 0.7),
            )
            < 0
        ):
            if count > 1:
                # Too many entries for one page: try half of them.
                count //= 2
                chunk = "".join(remaining[:count])
            elif len(chunk) > 1:
                # A single entry is larger than a page: truncate it.
                chunk = chunk[: len(chunk) // 2]
            else:
                break
        remaining = remaining[count:]
|
|
|
|
|
|
|
|
def agentic_pdf_annotator(changes, file_bytes, extraction_method="Landing AI", nlp_preprocessing=True):
    """
    Highlight confirmed and validated changes in a PDF and return the annotated file.

    Only changes with truthy ``confirmed`` and ``validated`` entries are used.
    Each change is searched on its grounded page; when that page index is
    missing or outside the document, every page is searched instead. If the
    exact text has no free (not yet annotated) hit, the result of
    ``get_best_fuzzy_match`` is searched as a fallback. Changes that cannot be
    placed are listed on pages appended to the document.

    :param changes: Change dicts; ``text``, ``type``, ``category`` and
        ``context`` are required, ``grounding[0]["page"]`` is optional.
    :param file_bytes: The PDF content, passed to ``pymupdf.open`` as stream.
    :param extraction_method: Label forwarded to the summary box.
    :param nlp_preprocessing: Flag forwarded to the summary box.
    :return: The annotated PDF as bytes, or ``""`` when there is nothing to
        annotate or the PDF cannot be opened.
    """
    changes = [
        c for c in changes if c.get("confirmed", False) and c.get("validated", False)
    ]
    if not changes:
        return ""

    try:
        doc = pymupdf.open(stream=file_bytes, filetype="pdf")
    except Exception as e:
        # Callers rely on the empty-string result; report the reason instead
        # of dropping it silently.
        print(f"Could not open PDF for annotation: {e}")
        return ""

    try:
        successful_annotations = 0
        failed_annotations = []

        # Longest texts first, so a short change cannot occupy a region that a
        # longer change containing it still needs.
        changes = sorted(changes, key=lambda c: len(c["text"]), reverse=True)

        page_count = len(doc)
        page_texts = [page.get_text() for page in doc]
        full_text = "".join(page_texts)
        annotated_areas = {}

        for change in changes:
            grounded_page = _grounded_page(change)
            text = change["text"]
            change_type = change["type"]
            change_str = change["category"]
            comment = change["context"]

            if 0 <= grounded_page < page_count:
                candidate_pages = [grounded_page]
                haystack = page_texts[grounded_page]
            else:
                # Unknown or out-of-range page: fall back to the whole document.
                candidate_pages = range(page_count)
                haystack = full_text

            hit = _find_free_hits(doc, candidate_pages, text, annotated_areas)
            if hit is None:
                best_match = get_best_fuzzy_match(haystack, change)
                if best_match:
                    print("found best fuzzy match: ", best_match)
                    hit = _find_free_hits(
                        doc, candidate_pages, best_match, annotated_areas
                    )

            if hit is None:
                print(
                    f"No non-overlapping match found on page {grounded_page} for: '{text}'"
                )
                failed_annotations.append({"change": change, "page": grounded_page})
                continue

            page_num, doc_page, rects = hit
            taken_areas = annotated_areas[page_num]
            color = color_mapping.get(change_type, (1, 1, 0))

            # The first hit carries the comment.
            highlight = doc_page.add_highlight_annot(rects[0])
            highlight.set_colors({"stroke": color})
            highlight.set_info(
                info={
                    "title": "Comment",
                    "content": f"{change_type} - {change_str}\n{comment}",
                    "name": change_type,
                }
            )
            highlight.update()
            taken_areas.append(rects[0])
            successful_annotations += 1

            # Further hits are highlighted only when they hold a proper part of
            # the change text; hits holding the full text are left alone.
            wanted = text.strip()
            for rect in rects[1:]:
                fragment = doc_page.get_textbox(rect).strip()
                if fragment != wanted and fragment in wanted:
                    highlight = doc_page.add_highlight_annot(rect)
                    highlight.set_colors({"stroke": color})
                    highlight.update()
                    taken_areas.append(rect)

        add_infos_to_pdf_agentic(
            doc, changes, successful_annotations, extraction_method, nlp_preprocessing
        )
        add_failed_annotations_to_pdf_agentic(doc, failed_annotations)
        return doc.tobytes()
    finally:
        doc.close()


def _grounded_page(change):
    """Return the page index a change is grounded on, or -1 if it has no usable grounding."""
    try:
        return int(change["grounding"][0]["page"])
    except (KeyError, IndexError, TypeError, ValueError):
        return -1


def _unannotated(rects, taken_areas):
    """Return the rects that do not intersect any already annotated area."""
    return [
        rect
        for rect in rects
        if not any(rect.intersects(area) for area in taken_areas)
    ]


def _find_free_hits(doc, page_numbers, needle, annotated_areas):
    """Return (page number, page, free rects) for the first page with a free hit, else None."""
    for pno in page_numbers:
        taken_areas = annotated_areas.setdefault(pno, [])
        page = doc.load_page(pno)
        free = _unannotated(page.search_for(needle), taken_areas)
        if free:
            return pno, page, free
    return None
|
|
|