# regulens / scripts / agentic_pdfeditor.py
# amougou-fortiss's picture
# Upload 2 files
# 5820bde verified
import pymupdf
from scripts.utility_functions import color_mapping, get_best_fuzzy_match
def add_infos_to_pdf_agentic(
    doc,
    changes,
    successful_annotations,
    extraction_method="Landing AI",
    nlp_preprocessing=True,
):
    """
    Doc is edited in place.
    Writes a summary box onto the first page and updates the document metadata.

    The summary lists the extraction method, whether NLP preprocessing was used,
    the total number of changes, the number of successful annotations and the
    number of additions, deletions and modifications. Changes whose "type" is
    missing or unknown are tallied as "unspecified"; that tally is not printed.

    :param doc: The PyMuPDF document object.
    :type doc: pymupdf.Document
    :param changes: The changes that were processed; each is read via ``.get("type")``.
    :type changes: list
    :param successful_annotations: Number of changes that could be highlighted.
    :type successful_annotations: int
    :param extraction_method: Label printed as the extraction method.
    :type extraction_method: str
    :param nlp_preprocessing: Printed as "yes" when truthy, otherwise "no".
    :type nlp_preprocessing: bool
    """
    # Tally the changes per type; anything unexpected lands in "unspecified".
    tally = dict.fromkeys(("addition", "modification", "deletion", "unspecified"), 0)
    for entry in changes:
        kind = entry.get("type", "unspecified")
        bucket = kind if kind in tally else "unspecified"
        tally[bucket] += 1

    preprocessing_flag = "yes" if nlp_preprocessing else "no"
    summary_parts = [
        "Regulatory Summary:\n",
        f"- Extraction Method: {extraction_method}\n",
        f"- Nlp preprocessing: {preprocessing_flag}\n",
        f"- Total Changes: {len(changes)}, Successful Annotations: {successful_annotations}\n",
        f"- Additions: {tally['addition']}\n",
        f"- Deletions: {tally['deletion']}\n",
        f"- Modifications: {tally['modification']}\n",
    ]
    summary_text = "".join(summary_parts)

    # The box is drawn on top of whatever is already at the top of page one.
    first_page = doc.load_page(0)
    summary_box = pymupdf.Rect(10, 10, 550, 150)
    first_page.insert_textbox(
        summary_box,
        summary_text,
        fontsize=9,
        fontname="helv",
        align=pymupdf.TEXT_ALIGN_LEFT,
        color=(0, 0, 0.7),
        overlay=True,
    )

    # Prefix the existing title/author instead of discarding them.
    meta = doc.metadata
    title_suffix = meta["title"] or "PDF"
    meta["title"] = "Annotated " + title_suffix
    author_suffix = " & " + meta["author"] if meta["author"] else ""
    meta["author"] = "Fortiss ReguLens" + author_suffix
    meta["subject"] = "Annotated PDF with regulatory changes"
    meta["keywords"] = "regulatory, changes, annotations, pdf"
    doc.set_metadata(meta)
def add_failed_annotations_to_pdf_agentic(doc, failed_annotations):
    """
    Doc is edited in place.
    Adds a report of the failed annotations to the end of the PDF document.

    The report starts on a newly appended page. ``insert_textbox`` writes
    nothing at all (and returns a negative number) when the text does not fit
    into its rectangle, so the report is spread over as many appended pages as
    needed instead of being dropped when the list is long.

    :param doc: The PyMuPDF document object.
    :type doc: pymupdf.Document
    :param failed_annotations: The failed annotations to be added. Each entry
        is a dict with a "page" value and a "change" dict providing "text",
        "type" and "category".
    :type failed_annotations: list
    :raises KeyError: If an entry lacks one of the keys listed above.
    """
    if not failed_annotations:
        return

    # One report entry per failed annotation, preceded by the heading.
    pending = ["Failed Annotations:"]
    for failed_annotation in failed_annotations:
        change = failed_annotation["change"]
        text = change["text"]
        change_type = change["type"]
        change_str = change["category"]
        page_num = failed_annotation["page"]
        pending.append(
            f"Page {page_num}: {text} ({change_type}) Change: {change_str}"
        )

    rect = pymupdf.Rect(20, 20, 580, 822)
    fontsize = 9
    # Every entry needs at least one line and a line is at least one font size
    # tall, so this bounds how many entries can ever share a page. It only
    # limits the number of fitting attempts below.
    max_entries_per_page = max(1, int(rect.height // fontsize))

    while pending:
        page = doc.new_page(pno=-1)
        take = min(len(pending), max_entries_per_page)
        while True:
            unused_space = page.insert_textbox(
                rect,
                "\n".join(pending[:take]) + "\n",
                fontsize=fontsize,
                fontname="helv",
                align=pymupdf.TEXT_ALIGN_LEFT,
                color=(0, 0, 0.7),
            )
            if unused_space >= 0:
                # Text was written; the remaining entries go to the next page.
                break
            if take > 1:
                # Nothing was written; retry with one entry less.
                take -= 1
            elif len(pending[0]) > 1:
                # A single entry is too long for one page: split it in two so
                # the halves can be placed on consecutive pages.
                entry = pending[0]
                middle = len(entry) // 2
                pending[0:1] = [entry[:middle], entry[middle:]]
            else:
                # Cannot be shrunk any further; skip it rather than loop forever.
                break
        del pending[:take]
def _find_free_rects(doc, needle, page_indices, annotated_areas):
    """
    Searches the given pages in order for ``needle`` and returns the hits of
    the first page that has at least one hit which does not intersect an area
    that was already highlighted on that page.

    :param doc: The PyMuPDF document object.
    :param needle: The text to search for.
    :param page_indices: Page indices to search, in order of preference.
    :param annotated_areas: Maps a page index to the rectangles already highlighted there.
    :return: ``(page index, free hit rectangles)`` or ``(None, [])`` if nothing was found.
    """
    for page_index in page_indices:
        taken = annotated_areas.setdefault(page_index, [])
        hits = doc.load_page(page_index).search_for(needle)
        free = [
            hit for hit in hits if not any(hit.intersects(area) for area in taken)
        ]
        if free:
            return page_index, free
    return None, []


def agentic_pdf_annotator(
    changes,
    file_bytes,
    extraction_method="Landing AI",
    nlp_preprocessing=True,
):
    """
    Highlights the confirmed and validated changes in a PDF and returns the
    annotated PDF.

    Each change is looked up on the page given by ``change["grounding"][0]["page"]``
    (used directly as a page index). If that index is outside the document, all
    pages are searched instead. When the exact text yields no usable hit, the
    result of ``get_best_fuzzy_match`` is searched for. Hits overlapping an
    earlier highlight are ignored. Changes that cannot be placed are listed on
    pages appended to the document, and a summary box is written to page one.

    :param changes: Change dicts. Only those with truthy "confirmed" and
        "validated" are used; these must provide "text", "type", "category",
        "context" and "grounding".
    :type changes: list
    :param file_bytes: The content of the PDF file.
    :type file_bytes: bytes
    :param extraction_method: Label forwarded to the summary box.
    :type extraction_method: str
    :param nlp_preprocessing: Flag forwarded to the summary box.
    :type nlp_preprocessing: bool
    :return: The annotated PDF as bytes, or ``""`` if there is no confirmed and
        validated change or the PDF cannot be opened.
    """
    changes = [
        c for c in changes if c.get("confirmed", False) and c.get("validated", False)
    ]
    if not changes:
        return ""

    try:
        doc = pymupdf.open(stream=file_bytes, filetype="pdf")
    except Exception as e:
        # Best effort: callers receive the empty result, but the reason is reported.
        print(f"Could not open PDF for annotation: {e}")
        return ""

    try:
        # Longest texts first, so shorter texts cannot claim parts of a longer match.
        changes = sorted(changes, key=lambda c: -len(c["text"]))

        page_count = len(doc)
        # Text of the whole document, used for fuzzy matching across all pages.
        full_text = "".join(doc[i].get_text() for i in range(page_count))

        annotated_areas = {}
        successful_annotations = 0
        failed_annotations = []

        for change in changes:
            requested_page = int(change["grounding"][0]["page"])
            text = change["text"]
            change_type = change["type"]
            change_str = change["category"]
            comment = change["context"]

            if 0 <= requested_page < page_count:
                page_indices = [requested_page]
                fuzzy_source = None  # page text is only extracted if needed
            else:
                # Unusable page reference: fall back to searching every page.
                page_indices = range(page_count)
                fuzzy_source = full_text

            searched = text
            page_num, results = _find_free_rects(
                doc, searched, page_indices, annotated_areas
            )

            if not results:
                if fuzzy_source is None:
                    fuzzy_source = doc.load_page(requested_page).get_text(
                        option="text"
                    )
                best_match = get_best_fuzzy_match(fuzzy_source, change)
                if best_match:
                    print("found best fuzzy match: ", best_match)
                    searched = best_match
                    page_num, results = _find_free_rects(
                        doc, searched, page_indices, annotated_areas
                    )

            if not results:
                # Always report the page the change was grounded on, not
                # whichever page happened to be searched last.
                print(
                    f"No non-overlapping match found on page {requested_page} for: '{text}'"
                )
                failed_annotations.append({"change": change, "page": requested_page})
                continue

            doc_page = doc.load_page(page_num)
            color = color_mapping.get(change_type, (1, 1, 0))

            annotated_areas[page_num].append(results[0])
            highlight = doc_page.add_highlight_annot(results[0])
            highlight.set_colors({"stroke": color})
            highlight.set_info(
                info={
                    "title": "Comment",
                    "content": f"{change_type} - {change_str}\n{comment}",
                    "name": change_type,
                }
            )
            highlight.update()
            successful_annotations += 1

            # A match spanning several lines yields one rectangle per line. A
            # further rectangle belongs to the same match if its text is a
            # proper part of the string that was searched for; a rectangle
            # containing the complete string is a separate occurrence.
            needle = searched.strip()
            for result in results[1:]:
                fragment = doc_page.get_textbox(result).strip()
                if fragment != needle and fragment in needle:
                    highlight = doc_page.add_highlight_annot(result)
                    highlight.set_colors({"stroke": color})
                    highlight.update()
                    annotated_areas[page_num].append(result)

        add_infos_to_pdf_agentic(
            doc, changes, successful_annotations, extraction_method, nlp_preprocessing
        )
        add_failed_annotations_to_pdf_agentic(doc, failed_annotations)
        return doc.tobytes()
    finally:
        # Release the document even if annotating fails part-way.
        doc.close()