"""Highlight regulatory changes in a PDF and attach a summary using PyMuPDF."""

import pymupdf

from scripts.utility_functions import color_mapping, get_best_fuzzy_match


def add_infos_to_pdf_agentic(
    doc,
    changes,
    successful_annotations,
    extraction_method="Landing AI",
    nlp_preprocessing=True,
):
    """
    Doc is edited in place. Writes a summary box onto the first page and
    updates the document metadata.

    :param doc: The PyMuPDF document object.
    :type doc: pymupdf.Document
    :param changes: The changes that were processed; each is a dict whose
        optional ``"type"`` key is counted for the summary.
    :type changes: list[dict]
    :param successful_annotations: Number of changes that were highlighted.
    :type successful_annotations: int
    :param extraction_method: Label printed in the summary.
    :type extraction_method: str
    :param nlp_preprocessing: Printed as "yes"/"no" in the summary.
    :type nlp_preprocessing: bool
    """
    type_counts = {
        "addition": 0,
        "modification": 0,
        "deletion": 0,
        "unspecified": 0,
    }
    for change in changes:
        change_type = change.get("type", "unspecified")
        # Any type outside the known buckets is counted as "unspecified".
        bucket = change_type if change_type in type_counts else "unspecified"
        type_counts[bucket] += 1

    summary_text = (
        "Regulatory Summary:\n"
        f"- Extraction Method: {extraction_method}\n"
        f"- Nlp preprocessing: {'yes' if nlp_preprocessing else 'no'}\n"
        f"- Total Changes: {len(changes)}, Successful Annotations: {successful_annotations}\n"
        f"- Additions: {type_counts['addition']}\n"
        f"- Deletions: {type_counts['deletion']}\n"
        f"- Modifications: {type_counts['modification']}\n"
    )

    page = doc.load_page(0)
    rect = pymupdf.Rect(10, 10, 550, 150)
    page.insert_textbox(
        rect,
        summary_text,
        fontsize=9,
        fontname="helv",
        align=pymupdf.TEXT_ALIGN_LEFT,
        color=(0, 0, 0.7),
        overlay=True,
    )

    # Work on a copy and read with .get() so a missing key (or missing
    # metadata) falls back to the defaults instead of raising KeyError.
    metadata = dict(doc.metadata or {})
    title = metadata.get("title")
    author = metadata.get("author")
    metadata["title"] = "Annotated " + (title if title else "PDF")
    metadata["author"] = "Fortiss ReguLens" + (" & " + author if author else "")
    metadata["subject"] = "Annotated PDF with regulatory changes"
    metadata["keywords"] = "regulatory, changes, annotations, pdf"
    doc.set_metadata(metadata)


def add_failed_annotations_to_pdf_agentic(doc, failed_annotations):
    """
    Doc is edited in place. Adds failed annotations to the end of the PDF document.

    Nothing is added when ``failed_annotations`` is empty.

    :param doc: The PyMuPDF document object.
    :type doc: pymupdf.Document
    :param failed_annotations: The failed annotations to be added. Each entry
        is a dict with a ``"page"`` value and a ``"change"`` dict providing
        ``"text"``, ``"type"`` and ``"category"``.
    :type failed_annotations: list[dict]
    """
    if not failed_annotations:
        return

    page = doc.new_page(pno=-1)  # -1 appends the page at the end

    lines = ["Failed Annotations:"]
    for failed_annotation in failed_annotations:
        change = failed_annotation["change"]
        lines.append(
            f"Page {failed_annotation['page']}: {change['text']} "
            f"({change['type']}) Change: {change['category']}"
        )
    annotation_str = "\n".join(lines) + "\n"

    # NOTE(review): per the PyMuPDF docs insert_textbox returns a negative
    # value and writes nothing when the text does not fit the rectangle, so a
    # long failure list may yield an empty page - confirm and paginate if so.
    rect = pymupdf.Rect(20, 20, 580, 822)
    page.insert_textbox(
        rect,
        annotation_str,
        fontsize=9,
        fontname="helv",
        align=pymupdf.TEXT_ALIGN_LEFT,
        color=(0, 0, 0.7),
    )


def _drop_overlapping(rects, taken):
    """Return the rects that intersect none of the already annotated ``taken`` rects."""
    return [
        rect
        for rect in rects
        if not any(rect.intersects(area) for area in taken)
    ]


def _search_all_pages(doc, needle, annotated_areas):
    """Return ``(page index, rect)`` hits for ``needle`` on every page, minus annotated areas."""
    hits = []
    for page_index in range(len(doc)):
        taken = annotated_areas.setdefault(page_index, [])
        found = doc.load_page(page_index).search_for(needle)
        hits.extend((page_index, rect) for rect in _drop_overlapping(found, taken))
    return hits


def _locate_on_page(page, change, taken):
    """Return free rects for the change text on ``page``, retrying with a fuzzy match."""
    rects = _drop_overlapping(page.search_for(change["text"]), taken)
    if not rects:
        best_match = get_best_fuzzy_match(page.get_text(option="text"), change)
        if best_match:
            print("found best fuzzy match: ", best_match)
            rects = _drop_overlapping(page.search_for(best_match), taken)
    return rects


def _locate_anywhere(doc, change, full_text, annotated_areas):
    """Return ``(page index, rects)`` for the first page with a free hit, else ``(None, [])``."""
    hits = _search_all_pages(doc, change["text"], annotated_areas)
    if not hits:
        best_match = get_best_fuzzy_match(full_text, change)
        if best_match:
            print("found best fuzzy match: ", best_match)
            hits = _search_all_pages(doc, best_match, annotated_areas)
    if not hits:
        return None, []
    # Keep only the rects of the first page that produced a hit.
    page_index = hits[0][0]
    return page_index, [rect for index, rect in hits if index == page_index]


def agentic_pdf_annotator(
    changes,
    file_bytes,
    extraction_method="Landing AI",
    nlp_preprocessing=True,
):
    """
    Highlight confirmed and validated changes in a PDF and return the result.

    Each change is looked up on the page named in its grounding; when that
    page index is outside the document, all pages are searched. If the exact
    text is not found, a fuzzy match is tried. Changes that cannot be placed
    are listed on an extra page appended to the document, and a summary box
    is written onto the first page.

    :param changes: Change dicts. Only those with truthy ``"confirmed"`` and
        ``"validated"`` are used; these must provide ``"text"``, ``"type"``,
        ``"category"``, ``"context"`` and ``"grounding"`` (a sequence whose
        first item has a ``"page"`` value, used as a zero-based page index).
    :type changes: list[dict]
    :param file_bytes: The PDF content.
    :type file_bytes: bytes
    :param extraction_method: Label printed in the summary box.
    :type extraction_method: str
    :param nlp_preprocessing: Printed as "yes"/"no" in the summary box.
    :type nlp_preprocessing: bool
    :return: The annotated PDF as bytes, or ``""`` when there is nothing to
        annotate or the PDF cannot be opened.
    """
    changes = [
        c for c in changes if c.get("confirmed", False) and c.get("validated", False)
    ]
    if not changes:
        return ""

    try:
        doc = pymupdf.open(stream=file_bytes, filetype="pdf")
    except Exception as exc:
        # Best effort: callers receive "" for an unreadable PDF, but the
        # reason is no longer dropped silently.
        print(f"Could not open PDF for annotation: {exc}")
        return ""

    try:
        successful_annotations = 0
        failed_annotations = []

        # Sort by length of the change text in descending order to avoid
        # overlapping highlights
        changes = sorted(changes, key=lambda c: -len(c["text"]))

        # page index -> rects that already carry a highlight
        annotated_areas = {}
        full_text = "".join(
            doc[page_index].get_text() for page_index in range(len(doc))
        )

        for change in changes:
            claimed_page = int(change["grounding"][0]["page"])
            text = change["text"]
            change_type = change["type"]
            change_str = change["category"]
            comment = change["context"]

            if 0 <= claimed_page < len(doc):
                page_num = claimed_page
                doc_page = doc.load_page(page_num)
                results = _locate_on_page(
                    doc_page, change, annotated_areas.setdefault(page_num, [])
                )
            else:
                found_page, results = _locate_anywhere(
                    doc, change, full_text, annotated_areas
                )
                # When nothing is found, the failure is reported against the
                # page named in the grounding, not a leftover loop index.
                page_num = claimed_page if found_page is None else found_page
                doc_page = doc.load_page(page_num) if results else None

            if not results:
                print(f"No non-overlapping match found on page {page_num} for: '{text}'")
                failed_annotations.append({"change": change, "page": page_num})
                continue

            color = color_mapping.get(change_type, (1, 1, 0))
            taken = annotated_areas[page_num]

            taken.append(results[0])
            highlight = doc_page.add_highlight_annot(results[0])
            highlight.set_colors({"stroke": color})
            highlight.set_info(
                info={
                    "title": "Comment",
                    "content": f"{change_type} - {change_str}\n{comment}",
                    "name": change_type,
                }
            )
            highlight.update()
            successful_annotations += 1

            # A multiline hit yields one rect per line. A further rect is
            # treated as a continuation line when its text differs from the
            # search text but is contained in it.
            # NOTE(review): after a fuzzy match this still compares against
            # the original change text, not the matched string - confirm.
            wanted = text.strip()
            for result in results[1:]:
                fragment = doc_page.get_textbox(result).strip()
                if fragment != wanted and fragment in wanted:
                    highlight = doc_page.add_highlight_annot(result)
                    highlight.set_colors({"stroke": color})
                    highlight.update()
                    taken.append(result)

        add_infos_to_pdf_agentic(
            doc, changes, successful_annotations, extraction_method, nlp_preprocessing
        )
        add_failed_annotations_to_pdf_agentic(doc, failed_annotations)

        return doc.tobytes()
    finally:
        # Release the document even when annotation raises.
        doc.close()