Upload 9 files
- scripts/llm_nlp_preprocessing.py +128 -0
- scripts/llm_no_nlp_preprocessing.py +104 -0
- scripts/pdf_text_extractor.py +165 -0
- scripts/pdfeditor.py +401 -0
- scripts/pymupdf_nlp_preprocessing.py +157 -0
- scripts/pymupdf_no_nlp_preprocessing.py +140 -0
- scripts/text_extraction_landing_ai.py +57 -16
- scripts/utility_functions.py +38 -1
scripts/llm_nlp_preprocessing.py
ADDED
@@ -0,0 +1,128 @@
import json
import os
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm
from scripts.regulatory_change_foundation import CONTEXT_CATEGORIES
from scripts.utility_functions import call_nlp_service, render_prompt


# Load environment variables from .env file
load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")
openai_client = OpenAI(api_key=api_key)


def preprocess_text_with_nlp(text, max_chunk_size=512, overlap=50):
    result = call_nlp_service({"text": text}, "preprocess_text_with_nlp_llm")
    return result["chunks"], result["preprocessed_data"]


def create_prompt(chunk, preprocessed_data):
    return render_prompt(chunk, include_nlp=True, preprocessed_data=preprocessed_data)


def search_for_regulatory_changes(chunks, preprocessed_data, subtitle):
    results = []

    for chunk in chunks:
        response = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "You are a legal expert specializing in analyzing German regulatory documents with a focus on identifying regulatory changes. Only return JSON output.",
                },
                {"role": "user", "content": create_prompt(chunk, preprocessed_data)},
            ],
            temperature=0.7,
            max_tokens=1024,
        )

        try:
            result = json.loads(response.choices[0].message.content)
            if result.get("changes_detected", False):
                result["location"] = {"subtitle": subtitle}  # Use subtitle as location
                result["source_text"] = chunk
                results.append(result)
        except json.JSONDecodeError:
            continue

    return results


def detect_regulatory_changes(text_content, subtitle):
    """
    Main function to detect regulatory changes from text content.

    Args:
        text_content (str): The raw text content to analyze
        subtitle (str): The subtitle associated with the content

    Returns:
        dict: Structured output containing detected changes and analysis summary
    """

    # Preprocess text with enhanced NLP
    chunks, preprocessed_data = preprocess_text_with_nlp(text_content)

    # Classify changes using NLP insights
    results = search_for_regulatory_changes(chunks, preprocessed_data, subtitle)

    return results


def llm_regulatory_change_detector(hierarchical_structure):
    if hierarchical_structure:
        analysis_summary = {
            "total_changes_detected": 0,
            "changes_by_type": {"additions": 0, "deletions": 0, "modifications": 0},
        }
        subtitles = {}

        # Iterate over sections and analyze content
        for section in tqdm(
            hierarchical_structure["sections"], desc="Analyzing Sections"
        ):
            subtitle = section["subtitle"]
            content = section["content"]
            if isinstance(content, list):
                content = "\n".join(content)

            # Detect changes for this subtitle
            changes = detect_regulatory_changes(content, subtitle)

            # Update analysis summary
            for change in changes:
                analysis_summary["total_changes_detected"] += len(
                    change["classifications"]
                )
                for classification in change["classifications"]:
                    change_type = classification["change_type"]
                    analysis_summary["changes_by_type"][f"{change_type}s"] += 1

            # Group changes by subtitle
            subtitles[subtitle] = []
            for change in changes:
                for classification in change["classifications"]:
                    change_subtype = (
                        "context"
                        if classification["change"] in CONTEXT_CATEGORIES
                        else "scope"
                    )
                    subtitles[subtitle].append(
                        {
                            "change": classification["change"],
                            "change_type": classification["change_type"],
                            "change_subtype": change_subtype,
                            "relevant_text": classification["relevant_text"],
                            "explanation": classification["explanation"],
                            "nlp_evidence": classification["evidence"],
                        }
                    )

        # Combine analysis summary and grouped changes
        final_output = {"analysis_summary": analysis_summary, "results": subtitles}

        return final_output
scripts/llm_no_nlp_preprocessing.py
ADDED
@@ -0,0 +1,104 @@
import json
import os
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm
from scripts.regulatory_change_foundation import CONTEXT_CATEGORIES
from scripts.utility_functions import render_prompt


# Load environment variables from .env file
load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")
openai_client = OpenAI(api_key=api_key)


def create_prompt_without_nlp_insights(text):
    return render_prompt(text, include_nlp=False)


def classify_changes_without_nlp_insights(text_content, subtitle):
    """Classify changes in text chunks using OpenAI."""

    chunks = text_content.split("\n\n")
    results = []

    for chunk in chunks:
        response = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "You are a legal expert specializing in analyzing German regulatory documents with a focus on identifying regulatory changes. Only return JSON output.",
                },
                {"role": "user", "content": create_prompt_without_nlp_insights(chunk)},
            ],
            temperature=0.7,
            max_tokens=1024,
        )

        try:
            result = json.loads(response.choices[0].message.content)
            if result.get("changes_detected", False):
                result["location"] = {"subtitle": subtitle}  # Use subtitle as location
                result["source_text"] = chunk
                results.append(result)
        except json.JSONDecodeError:
            continue

    return results


def llm_regulatory_change_detector_without_nlp_insights(hierarchical_structure):
    if hierarchical_structure:
        analysis_summary = {
            "total_changes_detected": 0,
            "changes_by_type": {"additions": 0, "deletions": 0, "modifications": 0},
        }
        subtitles = {}

        # Iterate over sections and analyze content
        for section in tqdm(
            hierarchical_structure["sections"], desc="Analyzing Sections"
        ):
            subtitle = section["subtitle"]
            content = section["content"]
            if isinstance(content, list):
                content = "\n".join(content)

            # Detect changes for this subtitle
            changes = classify_changes_without_nlp_insights(content, subtitle)

            # Update analysis summary
            for change in changes:
                analysis_summary["total_changes_detected"] += len(
                    change["classifications"]
                )
                for classification in change["classifications"]:
                    change_type = classification["change_type"]
                    analysis_summary["changes_by_type"][f"{change_type}s"] += 1

            # Group changes by subtitle
            subtitles[subtitle] = []
            for change in changes:
                for classification in change["classifications"]:
                    change_subtype = (
                        "context"
                        if classification["change"] in CONTEXT_CATEGORIES
                        else "scope"
                    )
                    subtitles[subtitle].append(
                        {
                            "change": classification["change"],
                            "change_type": classification["change_type"],
                            "change_subtype": change_subtype,
                            "relevant_text": classification["relevant_text"],
                            "explanation": classification["explanation"],
                        }
                    )

        # Combine analysis summary and grouped changes
        final_output = {"analysis_summary": analysis_summary, "results": subtitles}

        return final_output
scripts/pdf_text_extractor.py
ADDED
@@ -0,0 +1,165 @@
import io
import json
import re
import pdfplumber
import pymupdf
from dotenv import load_dotenv
import os
from openai import OpenAI

# Load environment variables from .env file
load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")
openai_client = OpenAI(api_key=api_key)


def create_hierarchical_structure_by_pymupdf(pdf_input: str | bytes):
    """
    Create a hierarchical structure of text blocks from a PDF file using PyMuPDF.
    """
    if isinstance(pdf_input, (str, os.PathLike)):
        document = pymupdf.open(pdf_input)
    elif isinstance(pdf_input, bytes):
        document = pymupdf.open(stream=pdf_input, filetype="pdf")
    else:
        return {"blocks": []}

    structured_data = {"blocks": []}

    # Stack to keep track of hierarchical levels based on x0
    hierarchy_stack = []

    # Threshold for considering blocks at the same level
    x0_threshold = 1.5

    for page_num in range(len(document)):
        page = document[page_num]
        blocks = page.get_text("blocks")  # Extract text blocks

        for block in blocks:
            x0, y0, x1, y1, text, block_no, block_type = block

            # Skip empty text blocks
            if not text.strip():
                continue

            block_data = {
                "page_number": page_num + 1,
                "coordinates": {"x0": x0, "y0": y0, "x1": x1, "y1": y1},
                "text": text.strip(),
                "children": [],
            }

            # Determine the correct hierarchical level for the current block
            while (
                hierarchy_stack
                and (x0 - hierarchy_stack[-1]["coordinates"]["x0"]) <= x0_threshold
            ):
                hierarchy_stack.pop()

            if hierarchy_stack:
                # Add the current block as a child of the last block in the stack
                hierarchy_stack[-1]["children"].append(block_data)
            else:
                # If the stack is empty, add the block to the top level
                structured_data["blocks"].append(block_data)

            # Push the current block onto the stack
            hierarchy_stack.append(block_data)

    return structured_data


def extract_text_from_pdf(pdf_input: str | bytes):
    """Extract text from a PDF file."""

    text = ""
    with pdfplumber.open(io.BytesIO(pdf_input)) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text


def ask_openai_to_structure_text(text):
    """Use OpenAI API to structure the text into a hierarchical format."""

    prompt = f"""
    Structure the following text into a hierarchical structure to differentiate titles or subtitles from content.
    The main goal is to associate a content to a title or subtitle.
    Keep the same hierarchy of the text.
    Don't summarize the text, just structure it.
    Include all the pages of the text in the structure.
    You have to return a JSON which always has the name of the keys of the example output even for documents with other formats.
    Within the content key, you can have a list of strings representing the content.
    Ensure you return only a valid JSON.

    Text:
    {text}

    Example Output:
    {{
        "title": "Main Title",
        "sections": [
            {{
                "subtitle": "Subtitle 1",
                "content": [
                    "Content related to Subtitle 1.",
                    "More content related to Subtitle 1."
                ]
            }},
            {{
                "subtitle": "Subtitle 2",
                "content": [
                    "Content related to Subtitle 2.",
                    "More content related to Subtitle 2."
                ]
            }}
        ]
    }}
    """

    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant that extracts text from PDF documents",
            },
            {"role": "user", "content": prompt},
        ],
    )

    # Extract the content from the response
    response_text = response.choices[0].message.content

    # Remove Markdown code blocks (if present)
    response_text = re.sub(r"```json|```", "", response_text).strip()

    return response_text


def create_hierarchical_structure_by_llm(pdf_input: str | bytes):
    """Create a hierarchical structure for a PDF document from a path or bytes."""

    # Step 1: Extract text from the PDF
    if isinstance(pdf_input, (str, os.PathLike)) or isinstance(pdf_input, bytes):
        text = extract_text_from_pdf(pdf_input)
    else:
        raise ValueError("pdf_input must be a file path or bytes.")

    # Step 2: Ask OpenAI to structure the text
    structured_text = ask_openai_to_structure_text(text)

    # Step 3: Parse the structured text into a Python dictionary
    try:
        hierarchical_structure = json.loads(structured_text)
    except json.JSONDecodeError as e:
        print("Error parsing JSON response from OpenAI:", e)
        print("Raw response:", structured_text)
        return None

    return hierarchical_structure
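Usage note (not part of the diff): a minimal sketch of how the two extractors above feed the detector from scripts/llm_nlp_preprocessing.py. The sample path is illustrative, and the sketch assumes OPENAI_API_KEY is set and the remote NLP service is reachable; note that the LLM-based path expects raw PDF bytes (it wraps them in io.BytesIO for pdfplumber), while the PyMuPDF path accepts a path or bytes.

    from scripts.pdf_text_extractor import (
        create_hierarchical_structure_by_llm,
        create_hierarchical_structure_by_pymupdf,
    )
    from scripts.llm_nlp_preprocessing import llm_regulatory_change_detector

    pdf_path = "example_regulation.pdf"  # illustrative path

    # LLM-based structuring works on PDF bytes
    with open(pdf_path, "rb") as f:
        llm_structure = create_hierarchical_structure_by_llm(f.read())

    # PyMuPDF-based structuring accepts a path or bytes
    pymupdf_structure = create_hierarchical_structure_by_pymupdf(pdf_path)

    # The "title"/"sections" structure is what llm_regulatory_change_detector consumes
    if llm_structure:
        report = llm_regulatory_change_detector(llm_structure)
        print(report["analysis_summary"])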
scripts/pdfeditor.py
ADDED
@@ -0,0 +1,401 @@
import base64
import pymupdf

# from agentic_doc.parse import parse
from scripts.llm_nlp_preprocessing import llm_regulatory_change_detector
from scripts.llm_no_nlp_preprocessing import (
    llm_regulatory_change_detector_without_nlp_insights,
)
from scripts.pymupdf_nlp_preprocessing import (
    pymupdf_regulatory_change_detector_with_nlp_insights,
)
from scripts.pymupdf_no_nlp_preprocessing import (
    pymupdf_regulatory_change_detector_without_nlp_insights,
)
from scripts.pdf_text_extractor import (
    create_hierarchical_structure_by_llm,
    create_hierarchical_structure_by_pymupdf,
)


# Define hex colors as RGB tuples (0–1 range)
color_mapping = {
    "addition": (0, 1, 0),  # green
    "deletion": (1, 0, 0),  # red
    "modification": (0, 0.6, 1),  # blue
}


def add_infos_to_pdf(doc, analysis_summary, extraction_method, do_nlp_preprocessing):
    """
    Doc is edited in place.
    Adds metadata to the PDF document.
    Adds a summary of the analysis to the first page of the PDF.

    :param doc: The PyMuPDF document object.
    :type doc: pymupdf.Document
    :param analysis_summary: The summary of the analysis results.
    :type analysis_summary: dict
    :param extraction_method: The method used for text extraction from the PDF. Options are "PyMuPDF" or "LLM".
    :type extraction_method: str
    :param do_nlp_preprocessing: Flag indicating whether NLP preprocessing was used.
    :type do_nlp_preprocessing: bool
    """
    changes_by_type = analysis_summary.get("changes_by_type", {})

    additions = changes_by_type.get("addition") or changes_by_type.get("additions") or 0
    deletions = changes_by_type.get("deletion") or changes_by_type.get("deletions") or 0
    modifications = (
        changes_by_type.get("modification") or changes_by_type.get("modifications") or 0
    )

    summary_text = (
        "Regulatory Summary:\n"
        f"- Extraction Method: {extraction_method}, NLP Preprocessing: {'yes' if do_nlp_preprocessing else 'no'}\n"
        f"- Total Changes: {analysis_summary.get('total_changes_detected', '0')}, Successful Annotations: {analysis_summary.get('successful_annotations', '0')}\n"
        f"- Additions: {additions}\n"
        f"- Deletions: {deletions}\n"
        f"- Modifications: {modifications}\n"
    )
    page = doc.load_page(0)
    rect = pymupdf.Rect(10, 10, 550, 150)
    page.insert_textbox(
        rect,
        summary_text,
        fontsize=9,
        fontname="helv",
        align=pymupdf.TEXT_ALIGN_LEFT,
        color=(0, 0, 0.7),
        overlay=True,
    )

    metadata = doc.metadata
    metadata["title"] = "Annotated " + (
        metadata["title"] if metadata["title"] else "PDF"
    )
    metadata["author"] = "Fortiss Regulatory Change Detector" + (
        " & " + metadata["author"] if metadata["author"] else ""
    )
    metadata["subject"] = "Annotated PDF with regulatory changes"
    metadata["keywords"] = "regulatory, changes, annotations, pdf"
    doc.set_metadata(metadata)


def add_failed_annotations_to_pdf(doc, failed_annotations):
    """
    Doc is edited in place.
    Adds failed annotations to the end of the PDF document.

    :param doc: The PyMuPDF document object.
    :type doc: pymupdf.Document
    :param failed_annotations: The failed annotations to be added.
    :type failed_annotations: array
    """
    if not failed_annotations:
        return
    page = doc.new_page(pno=-1)
    annotation_str = "Failed Annotations:\n"
    for failed_annotation in failed_annotations:
        text = failed_annotation["change"]["relevant_text"]
        change_type = failed_annotation["change"]["change_type"]
        change_str = failed_annotation["change"]["change"]
        page_num = failed_annotation["page"]
        annotation_str += (
            f"Page {page_num}: {text} ({change_type}) Change: {change_str}\n"
        )

    rect = pymupdf.Rect(20, 20, 580, 822)
    page.insert_textbox(
        rect,
        annotation_str,
        fontsize=9,
        fontname="helv",
        align=pymupdf.TEXT_ALIGN_LEFT,
        color=(0, 0, 0.7),
    )


def get_data_dict_pymupdf(pdf_input: str, do_nlp_preprocessing: bool = True):
    try:
        pymupdf_structure = create_hierarchical_structure_by_pymupdf(pdf_input)
    except Exception as e:
        raise Exception(f"Error extracting text from PDF: {e}")
    try:
        if do_nlp_preprocessing:
            data_dict, _ = pymupdf_regulatory_change_detector_with_nlp_insights(
                pymupdf_structure
            )
        else:
            data_dict, _ = pymupdf_regulatory_change_detector_without_nlp_insights(
                pymupdf_structure
            )
        return data_dict
    except Exception as e:
        raise Exception(f"Error querying the pymupdf: {e}")


def extract_document_pymupdf(uploaded_document: bytes, do_nlp_preprocessing=True):
    data = get_data_dict_pymupdf(uploaded_document, do_nlp_preprocessing)
    if not data:
        return [], ""
    flattened_changes = []
    for page_num_str, changes in data.get("changes_by_page", {}).items():
        for change in changes:
            flattened_changes.append(
                {
                    "text": change.get("relevant_text", ""),
                    "validated": False,
                    "confirmed": False,
                    "category": change.get("change", ""),
                    "type": change.get("change_type", ""),
                    "context": change.get("explanation", ""),
                    "grounding": [{"page": int(page_num_str), "line": -1}],
                }
            )
    markdown = ""  # parse(uploaded_document.read())[0].model_dump_json().get("markdown", "")
    return flattened_changes, markdown


def pymupdf_pdf_annotator(pdf_path, do_nlp_preprocessing=True):
    """
    Annotates a PDF document by applying highlights and comments based on the changes
    it gets from querying the LLM with NLP preprocessing.
    The text is extracted using PyMuPDF.
    The annotations involve identifying specific text passages within the PDF and assigning an appropriate highlight color and comment
    based on the change type (addition, deletion, or modification).

    :param pdf_path: The file path to the PDF document that will be annotated.
    :type pdf_path: str
    :param do_nlp_preprocessing: Flag indicating whether to use NLP preprocessing for text extraction. Default is True.
    :type do_nlp_preprocessing: bool

    :return: Base64-encoded string of the annotated PDF document suitable for embedding in HTML.
    :rtype: str
    """
    try:
        doc = pymupdf.open(pdf_path)
    except Exception as e:
        raise Exception(f"Error opening PDF file: {e}")
    data = get_data_dict_pymupdf(pdf_path, do_nlp_preprocessing)
    if not data:
        raise Exception("No data found in the PDF document. Please check the file.")
    successful_annotations = 0
    failed_annotations = []

    for page_num_str, changes in data.get("changes_by_page", {}).items():
        page_num = int(page_num_str)
        doc_page = doc.load_page(page_num - 1)
        # Sort by length of relevant_text in descending order to avoid overlapping highlights
        changes = sorted(changes, key=lambda c: -len(c["relevant_text"]))
        annotated_areas = []

        for change in changes:
            text = change["relevant_text"]
            change_type = change["change_type"]
            change_str = change["change"]
            comment = change["explanation"]

            # Search for the relevant text on the page
            results = doc_page.search_for(text)
            # We only want the results that do not overlap with already annotated areas
            results = list(
                filter(
                    lambda result: not any(
                        result.intersects(area) for area in annotated_areas
                    ),
                    results,
                )
            )
            if not results:
                print(
                    f"No non-overlapping match found on page {page_num} for: '{text}'"
                )
                failed_annotations.append({"change": change, "page": page_num})
                continue

            color = color_mapping.get(change_type, (1, 1, 0))

            annotated_areas.append(results[0])
            highlight = doc_page.add_highlight_annot(results[0])
            highlight.set_colors({"stroke": color})
            highlight.set_info(
                info={
                    "title": "Comment",
                    "content": f"{change_type} - {change_str}\n{comment}",
                    "name": change_type,
                }
            )
            highlight.update()
            successful_annotations += 1

            # If the resulting rects contain anything other than our search text we know it is a multiline highlight, because for each line
            # we will have a new result rect. We need to check if the text in the rect is not equal to our search text but is inside of it.
            # TODO test with multiple instances of multiline text on same page
            for result in results[1:]:
                resulttext = doc_page.get_textbox(result)
                if (
                    (resulttext.strip() != text.strip())
                    and (resulttext.strip() in text.strip())
                    and (not any(result.intersects(area) for area in annotated_areas))
                ):
                    highlight = doc_page.add_highlight_annot(result)
                    highlight.set_colors({"stroke": color})
                    highlight.update()
                    annotated_areas.append(result)

    data["analysis_summary"]["successful_annotations"] = successful_annotations
    add_infos_to_pdf(doc, data["analysis_summary"], "PyMuPDF", do_nlp_preprocessing)
    add_failed_annotations_to_pdf(doc, failed_annotations)
    base64_pdf = base64.b64encode(doc.tobytes()).decode("utf-8")
    doc.saveIncr()
    doc.close()
    return base64_pdf


def extract_document_llm(uploaded_document: bytes, do_nlp_preprocessing=True):
    try:
        llm_structure = create_hierarchical_structure_by_llm(uploaded_document)
    except Exception as e:
        raise Exception(f"Error extracting text from PDF: {e}")
    try:
        if do_nlp_preprocessing:
            data_dict = llm_regulatory_change_detector(llm_structure)
        else:
            data_dict = llm_regulatory_change_detector_without_nlp_insights(
                llm_structure
            )
    except Exception as e:
        raise Exception(f"Error querying the LLM: {e}")
    data = data_dict
    flattened_changes = []
    for _, changes in data.get("results", {}).items():
        for change in changes:
            flattened_changes.append(
                {
                    "text": change.get("relevant_text", ""),
                    "validated": False,
                    "confirmed": False,
                    "category": change.get("change", ""),
                    "type": change.get("change_type", ""),
                    "context": change.get("explanation", ""),
                    "grounding": [{"page": -1, "line": -1}],
                }
            )
    markdown = ""  # parse(uploaded_document.read())[0].model_dump_json().get("markdown", "")
    return flattened_changes, markdown


def llm_pdf_annotator(pdf_path, do_nlp_preprocessing=True):
    """
    Annotates a PDF document by applying highlights and comments based on the changes
    it gets from querying the LLM with NLP preprocessing.
    The text is extracted using an LLM.
    The annotations involve identifying specific text passages within the PDF and assigning an appropriate highlight color and comment
    based on the change type (addition, deletion, or modification).

    :param pdf_path: The file path to the PDF document that will be annotated.
    :type pdf_path: str
    :param do_nlp_preprocessing: Flag indicating whether to use NLP preprocessing for text extraction. Default is True.
    :type do_nlp_preprocessing: bool

    :return: Base64-encoded string of the annotated PDF document suitable for embedding in HTML.
    :rtype: str
    """
    try:
        doc = pymupdf.open(pdf_path)
    except Exception as e:
        raise Exception(f"Error opening PDF file: {e}")

    try:
        llm_structure = create_hierarchical_structure_by_llm(pdf_path)
    except Exception as e:
        raise Exception(f"Error extracting text from PDF: {e}")
    try:
        if do_nlp_preprocessing:
            data_dict = llm_regulatory_change_detector(llm_structure)
        else:
            data_dict = llm_regulatory_change_detector_without_nlp_insights(
                llm_structure
            )
    except Exception as e:
        raise Exception(f"Error querying the LLM: {e}")
    data = data_dict
    successful_annotations = 0
    failed_annotations = []

    for _, changes in data.get("results", {}).items():
        # Sort by length of relevant_text in descending order to avoid overlapping highlights
        changes = sorted(changes, key=lambda c: -len(c["relevant_text"]))
        annotated_areas = []

        for change in changes:
            text = change["relevant_text"]
            change_type = change["change_type"]
            comment = change["explanation"]
            change_str = change["change"]
            results = []
            # Search the entire document for the text because we don't have the page index in the LLM output
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)
                text_instances = page.search_for(text)

                for inst in text_instances:
                    results.append({"page": page_num, "bbox": inst})
            # We only want the results that do not overlap with already annotated areas
            results = list(
                filter(
                    lambda result: not any(
                        result["bbox"].intersects(area) for area in annotated_areas
                    ),
                    results,
                )
            )
            if not results:
                print(
                    f"No non-overlapping match found on page {page_num} for: '{text}'"
                )
                failed_annotations.append({"change": change, "page": page_num})
                continue

            color = color_mapping.get(change_type, (1, 1, 0))
            # We only want the first result because we will add highlights for each line of the multiline text
            doc_page = doc.load_page(results[0]["page"])
            bbox = results[0]["bbox"]
            annotated_areas.append(bbox)
            highlight = doc_page.add_highlight_annot(bbox)
            highlight.set_colors({"stroke": color})
            highlight.set_info(
                info={
                    "title": "Comment",
                    "content": f"{change_type} - {change_str}\n{comment}",
                    "name": change_type,
                }
            )
            highlight.update()
            successful_annotations += 1

            # If the resulting rects contain anything other than our search text we know it is a multiline highlight, because for each line
            # we will have a new result rect. We need to check if the text in the rect is not equal to our search text but is inside of it.
            for result in results[1:]:
                resulttext = doc_page.get_textbox(result["bbox"])
                if (
                    (resulttext.strip() != text.strip())
                    and (resulttext.strip() in text.strip())
                    and (
                        not any(
                            result["bbox"].intersects(area) for area in annotated_areas
                        )
                    )
                ):
                    highlight = doc_page.add_highlight_annot(result["bbox"])
                    highlight.set_colors({"stroke": color})
                    highlight.update()
                    annotated_areas.append(result["bbox"])

    data["analysis_summary"]["successful_annotations"] = successful_annotations
    add_infos_to_pdf(doc, data["analysis_summary"], "LLM", do_nlp_preprocessing)
    add_failed_annotations_to_pdf(doc, failed_annotations)
    base64_pdf = base64.b64encode(doc.tobytes()).decode("utf-8")
    doc.saveIncr()
    doc.close()
    return base64_pdf
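Usage note (not part of the diff): a minimal sketch of calling the PyMuPDF annotator above; file names are illustrative, and it assumes OPENAI_API_KEY is set and the NLP service is reachable. The function returns the annotated document base64-encoded, which can either be decoded to a file or embedded in HTML.

    import base64
    from scripts.pdfeditor import pymupdf_pdf_annotator

    # Annotate a PDF on disk; the return value is the annotated document as base64
    b64_pdf = pymupdf_pdf_annotator("example_regulation.pdf", do_nlp_preprocessing=True)

    # Either persist the annotated copy...
    with open("example_regulation_annotated.pdf", "wb") as f:
        f.write(base64.b64decode(b64_pdf))

    # ...or embed it directly in HTML, which the base64 return value is intended for
    html_embed = f'<embed src="data:application/pdf;base64,{b64_pdf}" width="100%" height="800"/>'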
scripts/pymupdf_nlp_preprocessing.py
ADDED
@@ -0,0 +1,157 @@
import json
import os
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm
from scripts.regulatory_change_foundation import CONTEXT_CATEGORIES
from scripts.utility_functions import call_nlp_service, render_prompt


# Load environment variables from .env file
load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")
openai_client = OpenAI(api_key=api_key)


def preprocess_text_with_nlp(text, max_chunk_size=512, overlap=50):
    """Enhanced NLP preprocessing identical to the first experiment, using PyMuPDF text extraction."""
    return call_nlp_service({"text": text}, "preprocess_text_with_nlp_pymupdf")


def create_prompt_with_nlp(chunk, preprocessed_data):
    return render_prompt(chunk, include_nlp=True, preprocessed_data=preprocessed_data)


def classify_changes_with_nlp(text_content, location_info):
    """Classify changes with NLP preprocessing."""
    # Apply NLP preprocessing
    preprocessed_data = preprocess_text_with_nlp(text_content)

    # Split into chunks (using the same method as the first experiment)
    result = call_nlp_service({"text": text_content}, "recursive_character_text_splitter")
    chunks = result["chunks"]

    results = []
    for chunk in chunks:
        response = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "You are a legal expert analyzing German regulatory changes. Return only JSON.",
                },
                {
                    "role": "user",
                    "content": create_prompt_with_nlp(chunk, preprocessed_data),
                },
            ],
            temperature=0.7,
            max_tokens=1024,
        )

        try:
            result = json.loads(response.choices[0].message.content)
            if result.get("changes_detected", False):
                result["location"] = location_info
                result["source_text"] = chunk
                results.append(result)
        except json.JSONDecodeError:
            continue

    return results if results else None


def extract_hierarchical_text(block):
    """Extract text from a block including its parent and grandparent contexts."""
    text_parts = []

    # Check if block has a grandparent
    if (
        "parent" in block
        and block["parent"] is not None
        and "parent" in block["parent"]
        and block["parent"]["parent"] is not None
    ):
        text_parts.append(block["parent"]["parent"]["text"])

    # Check if block has a parent
    if "parent" in block and block["parent"] is not None:
        text_parts.append(block["parent"]["text"])

    # Add the current block's text
    text_parts.append(block["text"])

    # Join all text parts with newlines between them
    return "\n\n".join(text_parts)


def traverse_blocks_with_nlp(blocks, parent=None, results=None, is_top_level=True):
    """Traverse hierarchy with NLP-enhanced analysis."""
    if results is None:
        results = []

    iterable = (
        tqdm(blocks, desc="Processing Text blocks with NLP") if is_top_level else blocks
    )

    for block in iterable:
        block["parent"] = parent

        if "children" in block and not block["children"]:  # Leaf node
            text_content = extract_hierarchical_text(block)
            location_info = {
                "page_number": block["page_number"],
                "block_text": block["text"],
            }

            changes = classify_changes_with_nlp(text_content, location_info)
            if changes:
                for change in changes:
                    change["full_text"] = text_content
                    results.append(change)
        else:
            traverse_blocks_with_nlp(
                block["children"], block, results, is_top_level=False
            )

    return results


def pymupdf_regulatory_change_detector_with_nlp_insights(hierarchical_structure):
    """Main function with NLP integration."""
    if not hierarchical_structure:
        return {"error": "No structure provided"}, []

    analysis_summary = {
        "total_changes_detected": 0,
        "changes_by_type": {"addition": 0, "deletion": 0, "modification": 0},
    }
    changes_by_page = {}

    results = traverse_blocks_with_nlp(hierarchical_structure["blocks"])

    for change in results:
        analysis_summary["total_changes_detected"] += len(change["classifications"])
        for classification in change["classifications"]:
            analysis_summary["changes_by_type"][classification["change_type"]] += 1

            change_subtype = (
                "context" if classification["change"] in CONTEXT_CATEGORIES else "scope"
            )
            page_num = change["location"]["page_number"]
            changes_by_page.setdefault(page_num, []).append(
                {
                    "change": classification["change"],
                    "change_type": classification["change_type"],
                    "change_subtype": change_subtype,
                    "relevant_text": classification["relevant_text"],
                    "explanation": classification["explanation"],
                    "nlp_evidence": classification["evidence"],
                }
            )

    return {
        "analysis_summary": analysis_summary,
        "changes_by_page": changes_by_page,
    }, results
scripts/pymupdf_no_nlp_preprocessing.py
ADDED
@@ -0,0 +1,140 @@
import json
import os
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm
from scripts.regulatory_change_foundation import CONTEXT_CATEGORIES
from scripts.utility_functions import render_prompt
from scripts.pymupdf_nlp_preprocessing import extract_hierarchical_text


# Load environment variables from .env file
load_dotenv()

# nlp = spacy.load("de_core_news_sm")
api_key = os.getenv("OPENAI_API_KEY")
openai_client = OpenAI(api_key=api_key)


def create_prompt_without_nlp_insights(text):
    return render_prompt(text, include_nlp=False)


def classify_changes_without_nlp_insights(text_content, location_info):
    """Classify changes in text chunks using OpenAI."""

    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "You are a legal expert specializing in analyzing German regulatory documents with a focus on identifying regulatory changes. Only return JSON output.",
            },
            {
                "role": "user",
                "content": create_prompt_without_nlp_insights(text_content),
            },
        ],
        temperature=0.7,
        max_tokens=1024,
    )

    try:
        result = json.loads(response.choices[0].message.content)
        if result.get("changes_detected", False):
            result["location"] = location_info
            result["source_text"] = text_content
            return result
        return None
    except json.JSONDecodeError:
        return None


def traverse_blocks(
    blocks, parent=None, grandparent=None, results=None, is_top_level=True
):
    """Traverse the hierarchical structure in a depth-first manner and analyze leaf nodes."""
    if results is None:
        results = []
    iterable = (
        tqdm(blocks, desc="Processing Text blocks with NLP") if is_top_level else blocks
    )

    for block in iterable:
        # Add parent and grandparent references to the block for context tracking
        block["parent"] = parent

        if "children" in block and (
            not block["children"] or len(block["children"]) == 0
        ):  # This is a leaf node
            # Extract hierarchical text
            text_content = extract_hierarchical_text(block)

            # Define location info
            location_info = {
                "page_number": block["page_number"],
                "block_text": block["text"],
            }

            # Analyze the text for changes
            changes = classify_changes_without_nlp_insights(text_content, location_info)
            if changes:
                # Add the full hierarchical text to the result
                changes["text"] = text_content
                results.append(changes)
        else:
            traverse_blocks(
                block["children"], block, parent, results, is_top_level=False
            )

    return results


def pymupdf_regulatory_change_detector_without_nlp_insights(hierarchical_structure):
    """Main function to detect regulatory changes in the hierarchical structure."""
    if not hierarchical_structure:
        return {"error": "No hierarchical structure provided"}

    analysis_summary = {
        "total_changes_detected": 0,
        "changes_by_type": {"addition": 0, "deletion": 0, "modification": 0},
    }
    changes_by_page = {}

    # Traverse the blocks and analyze leaf nodes
    results = traverse_blocks(hierarchical_structure["blocks"])

    # Update analysis summary
    for change in results:
        analysis_summary["total_changes_detected"] += len(change["classifications"])

        for classification in change["classifications"]:
            change_type = classification["change_type"]
            analysis_summary["changes_by_type"][change_type] += 1

            # Group changes by page number
            page_number = change["location"]["page_number"]
            if page_number not in changes_by_page:
                changes_by_page[page_number] = []

            change_subtype = (
                "context" if classification["change"] in CONTEXT_CATEGORIES else "scope"
            )
            changes_by_page[page_number].append(
                {
                    "change": classification["change"],
                    "change_type": classification["change_type"],
                    "change_subtype": change_subtype,
                    "relevant_text": classification["relevant_text"],
                    "text": change["text"],
                    "explanation": classification["explanation"],
                }
            )

    # Combine analysis summary and grouped changes
    final_output = {
        "analysis_summary": analysis_summary,
        "changes_by_page": changes_by_page,
    }

    return final_output, results
scripts/text_extraction_landing_ai.py
CHANGED
@@ -2,27 +2,22 @@ import os
 import json
 import glob
 from agentic_doc.parse import parse
-from streamlit.runtime.uploaded_file_manager import UploadedFile
 
+from scripts.pymupdf_nlp_preprocessing import classify_changes_with_nlp
+from scripts.pymupdf_no_nlp_preprocessing import classify_changes_without_nlp_insights
 
 
-def extract_document(
-    uploaded_document: UploadedFile, extraction_dir="text_extractions/"
-):
-    """
-    Extract text from documents if not already extracted.
-
-    Args:
-        uploaded_document: UploadedFile: The document to extract text from.
-        extraction_dir (str): Directory to store/check for extracted result
-
-
-
+def extract_document_agentic(
+    uploaded_document_name: str,
+    uploaded_document_bytes: bytes,
+    do_nlp_preprocessing=True,
+    extraction_dir="text_extractions/",
+):
     # Ensure extraction directory exists
     os.makedirs(extraction_dir, exist_ok=True)
 
     # Get the base document name (without extension)
-    document_name = os.path.splitext(
+    document_name = os.path.splitext(uploaded_document_name)[0]
 
     # Pattern to match existing extractions (e.g., "documentABC_*.json")
     existing_extraction_pattern = os.path.join(
@@ -39,9 +34,55 @@ def extract_document(
     else:
         try:
             print(f"No existing extraction found for {document_name}, calling API...")
-            result = parse(
+            result = json.loads(parse(uploaded_document_bytes)[0].model_dump_json())
             print(f"Successfully extracted {document_name}")
         except Exception as e:
             print(f"Error extracting {document_name}: {str(e)}")
             result = {"status": "error", "error": str(e)}
-
+            return result
+    if result:
+        if "chunks" in result and isinstance(result["chunks"], list):
+            for chunk in result["chunks"]:
+                if do_nlp_preprocessing:
+                    classification_result = classify_changes_with_nlp(chunk["text"], "")
+                    # flatten into a single json element so it matches non-nlp part
+                    if classification_result and len(classification_result) > 0:
+                        flattened_classifications = {"changes_detected": classification_result[0].get("changes_detected", False), "classifications": []}
+                        for class_res in classification_result:
+                            if class_res.get("changes_detected", False):
+                                flattened_classifications["classifications"].extend(class_res.get("classifications", []))
+                        classification_result = flattened_classifications
+                else:
+                    classification_result = classify_changes_without_nlp_insights(
+                        chunk["text"], ""
+                    )
+                if classification_result and classification_result.get(
+                    "changes_detected", False
+                ):
+                    subchunks = []
+                    for subchunk in classification_result.get(
+                        "classifications", []
+                    ):
+                        subchunks.append(
+                            {
+                                "text": subchunk.get("relevant_text", ""),
+                                "validated": False,
+                                "confirmed": False,
+                                "category": subchunk.get("change", ""),
+                                "type": subchunk.get("change_type", ""),
+                                "context": subchunk.get("explanation", ""),
+                            }
+                        )
+                    chunk["subchunks"] = subchunks
+                else:
+                    result["chunks"].remove(chunk)
+    # Create flattened list of subchunks for UI compatibility
+    flattened_changes = []
+    for chunk in result["chunks"]:
+        if "subchunks" in chunk:
+            for subchunk in chunk["subchunks"]:
+                subchunk["grounding"] = chunk["grounding"]
+                subchunk["grounding"][0]["line"] = -1
+                subchunk["chunk_id"] = chunk["chunk_id"]
+                flattened_changes.append(subchunk)
+    return flattened_changes, result.get("markdown", "")
scripts/utility_functions.py
CHANGED
@@ -3,6 +3,7 @@ import os
 import json
 import re
 from rapidfuzz import fuzz
+import requests
 from scripts.regulatory_change_foundation import (
     CLASSIFICATION_INFO,
     FEW_SHOT_EXAMPLES,
@@ -88,7 +89,7 @@ def highlight_nth(text, change, skip_failed=False):
 
 # TODO:check treshhold->51 would get always a result
 # if we make it lower we get guaranteed matches but they might be different from the original target, but if threshold is too high we might not find any match eg when a word is missing
-def highlight_fuzzy_match(text, change, n=0, threshold=
+def highlight_fuzzy_match(text, change, n=0, threshold=80, skip_failed=False):
     target = change["text"]
     window_size = len(target)
     step = 1
@@ -123,6 +124,31 @@ def highlight_fuzzy_match(text, change, n=0, threshold=86, skip_failed=False):
     return text[:start_norm] + highlighted_span + text[end_norm:]
 
 
+# TODO:check treshhold->51 would get always a result
+# if we make it lower we get guaranteed matches but they might be different from the original target, but if threshold is too high we might not find any match eg when a word is missing
+def get_best_fuzzy_match(text, change, threshold=65):
+    """Find the best fuzzy match for a change in the text and return the matched section"""
+    n = change.get("occurrence_index", 0)
+    target = change["text"]
+    window_size = len(target)
+    step = 1
+
+    candidates = []
+    for i in range(0, len(text) - window_size, step):
+        window = text[i : i + window_size]
+        score = fuzz.partial_ratio(window.lower(), target.lower())
+        if score >= threshold:
+            candidates.append((score, i, i + window_size))
+
+    if not candidates:
+        return None
+    # Pick top-N match
+    candidates.sort(reverse=True)
+    _, start_norm, end_norm = candidates[min(n, len(candidates) - 1)]
+
+    return text[start_norm:end_norm]
+
+
 def render_prompt(text, include_nlp=False, preprocessed_data=None):
     classification_json = json.dumps(CLASSIFICATION_INFO, indent=2)
     few_shot_json = json.dumps(FEW_SHOT_EXAMPLES, indent=2)
@@ -170,3 +196,14 @@ def save_json_to_file(data, output_dir, output_file):
 
     # Print the location of the saved file
     print(f"JSON data saved successfully at: {file_path}")
+
+
+def call_nlp_service(payload, method):
+    url = f"https://amougou-fortiss-nlp-preprocessor.hf.space/{method}"
+
+    # Make the request
+    response = requests.post(url, data=payload)
+    if response.status_code == 200:
+        return response.json()
+    else:
+        raise Exception(f"NLP service error: {response.status_code} - {response.text}")