amougou-fortiss commited on
Commit
ce77033
·
verified ·
1 Parent(s): a7c8110

Upload 9 files

Browse files
scripts/llm_nlp_preprocessing.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from dotenv import load_dotenv
4
+ from openai import OpenAI
5
+ from tqdm import tqdm
6
+ from scripts.regulatory_change_foundation import CONTEXT_CATEGORIES
7
+ from scripts.utility_functions import call_nlp_service, render_prompt
8
+
9
+
10
+ # Load environment variables from .env file
11
+ load_dotenv()
12
+
13
+ api_key = os.getenv("OPENAI_API_KEY")
14
+ openai_client = OpenAI(api_key=api_key)
15
+
16
+
17
def preprocess_text_with_nlp(text, max_chunk_size=512, overlap=50):
    """Chunk *text* and compute NLP features via the external NLP service.

    Args:
        text (str): Raw document text to preprocess.
        max_chunk_size (int): Intended maximum chunk size.
            NOTE(review): currently unused — chunking is decided by the
            remote "preprocess_text_with_nlp_llm" endpoint.
        overlap (int): Intended chunk overlap. NOTE(review): also unused.

    Returns:
        tuple: ``(chunks, preprocessed_data)`` taken from the service reply.
    """
    result = call_nlp_service({"text": text}, "preprocess_text_with_nlp_llm")
    return result["chunks"], result["preprocessed_data"]
20
+
21
+
22
def create_prompt(chunk, preprocessed_data):
    """Render the LLM user prompt for *chunk*, embedding the NLP insights."""
    return render_prompt(chunk, include_nlp=True, preprocessed_data=preprocessed_data)
24
+
25
+
26
def search_for_regulatory_changes(chunks, preprocessed_data, subtitle):
    """Query the LLM once per chunk and collect detected regulatory changes.

    Args:
        chunks (list[str]): Text chunks to analyze individually.
        preprocessed_data: NLP features injected into each prompt.
        subtitle (str): Section subtitle, recorded as the change location.

    Returns:
        list[dict]: Parsed LLM replies whose "changes_detected" flag is
        truthy, each augmented with "location" and "source_text".
    """
    results = []

    for chunk in chunks:
        # The system prompt instructs the model to emit JSON only; the reply
        # is still parsed defensively below.
        response = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "You are a legal expert specializing in analyzing German regulatory documents with a focus on identifying regulatory changes. Only return JSON output.",
                },
                {"role": "user", "content": create_prompt(chunk, preprocessed_data)},
            ],
            temperature=0.7,
            max_tokens=1024,
        )

        try:
            result = json.loads(response.choices[0].message.content)
            if result.get("changes_detected", False):
                result["location"] = {"subtitle": subtitle}  # Use subtitle as location
                result["source_text"] = chunk
                results.append(result)
        except json.JSONDecodeError:
            # Non-JSON replies are skipped rather than aborting the whole scan.
            continue

    return results
53
+
54
+
55
def detect_regulatory_changes(text_content, subtitle):
    """
    Main function to detect regulatory changes from text content.

    Args:
        text_content (str): The raw text content to analyze
        subtitle (str): The subtitle associated with the content

    Returns:
        list: Per-chunk LLM results describing detected changes; each dict
        carries "classifications", "location" and "source_text".
        (The original docstring said ``dict`` — the function returns the
        list produced by ``search_for_regulatory_changes``.)
    """

    # Preprocess text with enhanced NLP (chunking + linguistic features)
    chunks, preprocessed_data = preprocess_text_with_nlp(text_content)

    # Classify changes using NLP insights
    results = search_for_regulatory_changes(chunks, preprocessed_data, subtitle)

    return results
74
+
75
+
76
def llm_regulatory_change_detector(hierarchical_structure):
    """Detect and aggregate regulatory changes for a structured document.

    Args:
        hierarchical_structure (dict): Extractor output with a "sections"
            list; each section has "subtitle" and "content" (str or list).

    Returns:
        dict: ``{"analysis_summary": {...}, "results": {subtitle: [changes]}}``.
        Fix: falsy input now yields an empty-but-valid structure — the
        previous implementation implicitly returned None, which crashed
        callers that immediately call ``.get()`` on the result.
    """
    analysis_summary = {
        "total_changes_detected": 0,
        "changes_by_type": {"additions": 0, "deletions": 0, "modifications": 0},
    }
    subtitles = {}

    if not hierarchical_structure:
        return {"analysis_summary": analysis_summary, "results": subtitles}

    # Iterate over sections and analyze content
    for section in tqdm(
        hierarchical_structure["sections"], desc="Analyzing Sections"
    ):
        subtitle = section["subtitle"]
        content = section["content"]
        if isinstance(content, list):
            content = "\n".join(content)

        # Detect changes for this subtitle
        changes = detect_regulatory_changes(content, subtitle)

        # Update analysis summary
        for change in changes:
            analysis_summary["total_changes_detected"] += len(
                change["classifications"]
            )
            for classification in change["classifications"]:
                # Pluralize to match the summary keys. Fix: unknown change
                # types are counted instead of raising KeyError.
                type_key = f"{classification['change_type']}s"
                by_type = analysis_summary["changes_by_type"]
                by_type[type_key] = by_type.get(type_key, 0) + 1

        # Group changes by subtitle
        subtitles[subtitle] = [
            {
                "change": classification["change"],
                "change_type": classification["change_type"],
                # "context" vs "scope" depending on the change category
                "change_subtype": (
                    "context"
                    if classification["change"] in CONTEXT_CATEGORIES
                    else "scope"
                ),
                "relevant_text": classification["relevant_text"],
                "explanation": classification["explanation"],
                "nlp_evidence": classification["evidence"],
            }
            for change in changes
            for classification in change["classifications"]
        ]

    # Combine analysis summary and grouped changes
    return {"analysis_summary": analysis_summary, "results": subtitles}
scripts/llm_no_nlp_preprocessing.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from dotenv import load_dotenv
4
+ from openai import OpenAI
5
+ from tqdm import tqdm
6
+ from scripts.regulatory_change_foundation import CONTEXT_CATEGORIES
7
+ from scripts.utility_functions import render_prompt
8
+
9
+
10
+ # Load environment variables from .env file
11
+ load_dotenv()
12
+
13
+ api_key = os.getenv("OPENAI_API_KEY")
14
+ openai_client = OpenAI(api_key=api_key)
15
+
16
+
17
def create_prompt_without_nlp_insights(text):
    """Render the plain LLM user prompt for *text* (no NLP insights)."""
    return render_prompt(text, include_nlp=False)
19
+
20
+
21
def classify_changes_without_nlp_insights(text_content, subtitle):
    """Classify changes in text chunks using OpenAI.

    Args:
        text_content (str): Section text; split into chunks on blank lines.
        subtitle (str): Section subtitle, recorded as the change location.

    Returns:
        list[dict]: LLM replies with "changes_detected" truthy, each
        augmented with "location" and "source_text".
    """

    # Naive paragraph chunking (no NLP-based splitting in this variant).
    chunks = text_content.split("\n\n")
    results = []

    for chunk in chunks:
        response = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "You are a legal expert specializing in analyzing German regulatory documents with a focus on identifying regulatory changes. Only return JSON output.",
                },
                {"role": "user", "content": create_prompt_without_nlp_insights(chunk)},
            ],
            temperature=0.7,
            max_tokens=1024,
        )

        try:
            result = json.loads(response.choices[0].message.content)
            if result.get("changes_detected", False):
                result["location"] = {"subtitle": subtitle}  # Use subtitle as location
                result["source_text"] = chunk
                results.append(result)
        except json.JSONDecodeError:
            # Skip chunks whose reply is not valid JSON.
            continue

    return results
51
+
52
+
53
def llm_regulatory_change_detector_without_nlp_insights(hierarchical_structure):
    """Detect and aggregate regulatory changes (no NLP preprocessing).

    Args:
        hierarchical_structure (dict): Extractor output with a "sections"
            list; each section has "subtitle" and "content" (str or list).

    Returns:
        dict: ``{"analysis_summary": {...}, "results": {subtitle: [changes]}}``.
        Fix: falsy input now yields an empty-but-valid structure — the
        previous implementation implicitly returned None, which crashed
        callers that immediately call ``.get()`` on the result.
    """
    analysis_summary = {
        "total_changes_detected": 0,
        "changes_by_type": {"additions": 0, "deletions": 0, "modifications": 0},
    }
    subtitles = {}

    if not hierarchical_structure:
        return {"analysis_summary": analysis_summary, "results": subtitles}

    # Iterate over sections and analyze content
    for section in tqdm(
        hierarchical_structure["sections"], desc="Analyzing Sections"
    ):
        subtitle = section["subtitle"]
        content = section["content"]
        if isinstance(content, list):
            content = "\n".join(content)

        # Detect changes for this subtitle
        changes = classify_changes_without_nlp_insights(content, subtitle)

        # Update analysis summary
        for change in changes:
            analysis_summary["total_changes_detected"] += len(
                change["classifications"]
            )
            for classification in change["classifications"]:
                # Pluralize to match the summary keys. Fix: unknown change
                # types are counted instead of raising KeyError.
                type_key = f"{classification['change_type']}s"
                by_type = analysis_summary["changes_by_type"]
                by_type[type_key] = by_type.get(type_key, 0) + 1

        # Group changes by subtitle (no "nlp_evidence" in this variant)
        subtitles[subtitle] = [
            {
                "change": classification["change"],
                "change_type": classification["change_type"],
                "change_subtype": (
                    "context"
                    if classification["change"] in CONTEXT_CATEGORIES
                    else "scope"
                ),
                "relevant_text": classification["relevant_text"],
                "explanation": classification["explanation"],
            }
            for change in changes
            for classification in change["classifications"]
        ]

    # Combine analysis summary and grouped changes
    return {"analysis_summary": analysis_summary, "results": subtitles}
scripts/pdf_text_extractor.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import json
3
+ import re
4
+ import pdfplumber
5
+ import pymupdf
6
+ from dotenv import load_dotenv
7
+ import os
8
+ from openai import OpenAI
9
+
10
+ # Load environment variables from .env file
11
+ load_dotenv()
12
+
13
+ api_key = os.getenv("OPENAI_API_KEY")
14
+ openai_client = OpenAI(api_key=api_key)
15
+
16
+
17
+ def create_hierarchical_structure_by_pymupdf(pdf_input: str | bytes):
18
+ """
19
+ Create a hierarchical structure of text blocks from a PDF file using PyMuPDF.
20
+ """
21
+ if isinstance(pdf_input, (str, os.PathLike)):
22
+ document = pymupdf.open(pdf_input)
23
+ elif isinstance(pdf_input, bytes):
24
+ document = pymupdf.open(stream=pdf_input, filetype="pdf")
25
+ else:
26
+ return {"blocks": []}
27
+
28
+ structured_data = {"blocks": []}
29
+
30
+ # Stack to keep track of hierarchical levels based on x0
31
+ hierarchy_stack = []
32
+
33
+ # Threshold for considering blocks at the same level
34
+ x0_threshold = 1.5
35
+
36
+ for page_num in range(len(document)):
37
+ page = document[page_num]
38
+ blocks = page.get_text("blocks") # Extract text blocks
39
+
40
+ for block in blocks:
41
+ x0, y0, x1, y1, text, block_no, block_type = block
42
+
43
+ # Skip empty text blocks
44
+ if not text.strip():
45
+ continue
46
+
47
+ block_data = {
48
+ "page_number": page_num + 1,
49
+ "coordinates": {"x0": x0, "y0": y0, "x1": x1, "y1": y1},
50
+ "text": text.strip(),
51
+ "children": [],
52
+ }
53
+
54
+ # Determine the correct hierarchical level for the current block
55
+ while (
56
+ hierarchy_stack
57
+ and (x0 - hierarchy_stack[-1]["coordinates"]["x0"]) <= x0_threshold
58
+ ):
59
+ hierarchy_stack.pop()
60
+
61
+ if hierarchy_stack:
62
+ # Add the current block as a child of the last block in the stack
63
+ hierarchy_stack[-1]["children"].append(block_data)
64
+ else:
65
+ # If the stack is empty, add the block to the top level
66
+ structured_data["blocks"].append(block_data)
67
+
68
+ # Push the current block onto the stack
69
+ hierarchy_stack.append(block_data)
70
+
71
+ return structured_data
72
+
73
+
74
def extract_text_from_pdf(pdf_input: str | bytes):
    """Extract plain text from a PDF given as a file path or raw bytes.

    Fixes two defects in the original implementation:
    - a ``str``/``os.PathLike`` input crashed because every input was
      wrapped in ``io.BytesIO``; paths are now passed to pdfplumber directly;
    - ``page.extract_text()`` returns ``None`` for pages with no extractable
      text, which previously raised ``TypeError`` on concatenation.

    :param pdf_input: PDF file path or raw PDF bytes.
    :return: str — extracted text, one trailing newline per page.
    """
    source = (
        pdf_input
        if isinstance(pdf_input, (str, os.PathLike))
        else io.BytesIO(pdf_input)
    )
    text = ""
    with pdfplumber.open(source) as pdf:
        for page in pdf.pages:
            text += (page.extract_text() or "") + "\n"
    return text
84
+
85
+
86
def ask_openai_to_structure_text(text):
    """Use OpenAI API to structure the text into a hierarchical format.

    :param text: Raw text extracted from a PDF.
    :return: str — the model reply with Markdown code fences stripped;
        expected (but not guaranteed) to be a JSON document matching the
        example schema embedded in the prompt.
    """

    # The prompt text below is runtime behavior and is kept verbatim.
    prompt = f"""
    Structure the following text into a hierarchical structure to diferentiate titles or subtitles from content.
    The main goal is to associate a content to a title or subtitle.
    Keep the same hierarchy of the text.
    Dont summarize the text, just structure it.
    Include all the pages of the text in the structure.
    You have to return a JSON which always has the name of the keys of the example output even for documents with other formats.
    Within the content key, you can have a list of strings representing the content
    Ensure you return only a valid JSON.

    Text:
    {text}

    Example Output:
    {{
        "title": "Main Title",
        "sections": [
            {{
                "subtitle": "Subtitle 1",
                "content": [
                    "Content related to Subtitle 1.",
                    "More content related to Subtitle 1."
                ]
            }},
            {{
                "subtitle": "Subtitle 2",
                "content": [
                    "Content related to Subtitle 2.",
                    "More content related to Subtitle 2."
                ]

            }}
        ]
    }}
    """

    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant that extract text from Pdf documents",
            },
            {"role": "user", "content": prompt},
        ],
    )

    # Extract the content from the response
    response_text = response.choices[0].message.content

    # Remove Markdown code blocks (if present)
    response_text = re.sub(r"```json|```", "", response_text).strip()

    return response_text
143
+
144
+
145
+ def create_hierarchical_structure_by_llm(pdf_input: str | bytes):
146
+ """Create a hierarchical structure for a PDF document from a path or bytes."""
147
+
148
+ # Step 1: Extract text from the PDF
149
+ if isinstance(pdf_input, (str, os.PathLike)) | isinstance(pdf_input, bytes):
150
+ text = extract_text_from_pdf(pdf_input)
151
+ else:
152
+ raise ValueError("pdf_input must be a file path or bytes.")
153
+
154
+ # Step 2: Ask OpenAI to structure the text
155
+ structured_text = ask_openai_to_structure_text(text)
156
+
157
+ # Step 3: Parse the structured text into a Python dictionary
158
+ try:
159
+ hierarchical_structure = json.loads(structured_text)
160
+ except json.JSONDecodeError as e:
161
+ print("Error parsing JSON response from OpenAI:", e)
162
+ print("Raw response:", structured_text)
163
+ return None
164
+
165
+ return hierarchical_structure
scripts/pdfeditor.py ADDED
@@ -0,0 +1,401 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import pymupdf
3
+
4
+ # from agentic_doc.parse import parse
5
+ from scripts.llm_nlp_preprocessing import llm_regulatory_change_detector
6
+ from scripts.llm_no_nlp_preprocessing import (
7
+ llm_regulatory_change_detector_without_nlp_insights,
8
+ )
9
+ from scripts.pymupdf_nlp_preprocessing import (
10
+ pymupdf_regulatory_change_detector_with_nlp_insights,
11
+ )
12
+ from scripts.pymupdf_no_nlp_preprocessing import (
13
+ pymupdf_regulatory_change_detector_without_nlp_insights,
14
+ )
15
+ from scripts.pdf_text_extractor import (
16
+ create_hierarchical_structure_by_llm,
17
+ create_hierarchical_structure_by_pymupdf,
18
+ )
19
+
20
+
21
# Highlight colors per change type, as RGB tuples with components in the
# 0–1 range expected by PyMuPDF annotation APIs.
color_mapping = {
    "addition": (0, 1, 0),  # green
    "deletion": (1, 0, 0),  # red
    "modification": (0, 0.6, 1),  # blue
}
27
+
28
+
29
def add_infos_to_pdf(doc, analysis_summary, extraction_method, do_nlp_preprocessing):
    """
    Doc is edited in place.
    Adds metadata to the PDF document.
    Adds a summary of the analysis to the first page of the PDF.

    :param doc: The PyMuPDF document object.
    :type doc: pymupdf.Document
    :param analysis_summary: The summary of the analysis results.
    :type analysis_summary: dict
    :param extraction_method: The method used for text extraction from the PDF. Options are "PyMuPDF" or "LLM".
    :type extraction_method: str
    :param do_nlp_preprocessing: Flag indicating whether NLP preprocessing was used.
    :type do_nlp_preprocessing: bool
    """
    changes_by_type = analysis_summary.get("changes_by_type", {})

    # The detectors disagree on key naming (singular vs plural), so both
    # spellings are accepted here.
    additions = changes_by_type.get("addition") or changes_by_type.get("additions") or 0
    deletions = changes_by_type.get("deletion") or changes_by_type.get("deletions") or 0
    modifications = (
        changes_by_type.get("modification") or changes_by_type.get("modifications") or 0
    )

    summary_text = (
        "Regulatory Summary:\n"
        f"- Extraction Method: {extraction_method}, NLP Preprocessing: {'yes' if do_nlp_preprocessing else 'no'}\n"
        f"- Total Changes: {analysis_summary.get('total_changes_detected', '0')}, Successful Annotations: {analysis_summary.get('successful_annotations', '0')}\n"
        f"- Additions: {additions}\n"
        f"- Deletions: {deletions}\n"
        f"- Modifications: {modifications}\n"
    )
    # Render the summary box in the top-left corner of the first page.
    page = doc.load_page(0)
    rect = pymupdf.Rect(10, 10, 550, 150)
    page.insert_textbox(
        rect,
        summary_text,
        fontsize=9,
        fontname="helv",
        align=pymupdf.TEXT_ALIGN_LEFT,
        color=(0, 0, 0.7),
        overlay=True,
    )

    # Prefix the existing metadata rather than replacing it.
    metadata = doc.metadata
    metadata["title"] = "Annotated " + (
        metadata["title"] if metadata["title"] else "PDF"
    )
    metadata["author"] = "Fortiss Regulatory Change Detector" + (
        " & " + metadata["author"] if metadata["author"] else ""
    )
    metadata["subject"] = "Annotated PDF with regulatory changes"
    metadata["keywords"] = "regulatory, changes, annotations, pdf"
    doc.set_metadata(metadata)
82
+
83
+
84
def add_failed_annotations_to_pdf(doc, failed_annotations):
    """
    Doc is edited in place.
    Adds failed annotations to the end of the PDF document.

    :param doc: The PyMuPDF document object.
    :type doc: pymupdf.Document
    :param failed_annotations: Entries of the form
        ``{"change": {...}, "page": int}`` that could not be highlighted.
    :type failed_annotations: list
    """
    if not failed_annotations:
        return
    # Append a fresh page listing everything that could not be annotated.
    page = doc.new_page(pno=-1)
    annotation_str = "Failed Annotations:\n"
    for failed_annotation in failed_annotations:
        text = failed_annotation["change"]["relevant_text"]
        change_type = failed_annotation["change"]["change_type"]
        change_str = failed_annotation["change"]["change"]
        page_num = failed_annotation["page"]
        annotation_str += (
            f"Page {page_num}: {text} ({change_type}) Change: {change_str}\n"
        )

    # NOTE(review): a single textbox is used; overflow beyond one page is
    # silently truncated by insert_textbox.
    rect = pymupdf.Rect(20, 20, 580, 822)
    page.insert_textbox(
        rect,
        annotation_str,
        fontsize=9,
        fontname="helv",
        align=pymupdf.TEXT_ALIGN_LEFT,
        color=(0, 0, 0.7),
    )
116
+
117
+
118
def get_data_dict_pymupdf(pdf_input: str, do_nlp_preprocessing: bool = True):
    """Extract the PDF hierarchy with PyMuPDF and run change detection.

    :param pdf_input: PDF file path or raw PDF bytes.
    :param do_nlp_preprocessing: Whether prompts should include NLP insights.
    :return: dict with "analysis_summary" and "changes_by_page".
    :raises Exception: wrapping any extraction or detection failure.
    """
    try:
        pymupdf_structure = create_hierarchical_structure_by_pymupdf(pdf_input)
    except Exception as e:
        raise Exception(f"Error extracting text from PDF: {e}")
    try:
        # Both detectors return (data_dict, raw_results); only the dict is used.
        if do_nlp_preprocessing:
            data_dict, _ = pymupdf_regulatory_change_detector_with_nlp_insights(
                pymupdf_structure
            )
        else:
            data_dict, _ = pymupdf_regulatory_change_detector_without_nlp_insights(
                pymupdf_structure
            )
        return data_dict
    except Exception as e:
        raise Exception(f"Error querying the pymupdf: {e}")
135
+
136
+
137
def extract_document_pymupdf(uploaded_document: bytes, do_nlp_preprocessing=True):
    """Run the PyMuPDF pipeline and flatten its per-page changes.

    :param uploaded_document: Raw PDF bytes (or a path) handed to the detector.
    :param do_nlp_preprocessing: Whether prompts should include NLP insights.
    :return: tuple of (flattened change dicts, markdown string).
    """
    data = get_data_dict_pymupdf(uploaded_document, do_nlp_preprocessing)
    if not data:
        return [], ""

    flattened_changes = [
        {
            "text": entry.get("relevant_text", ""),
            "validated": False,
            "confirmed": False,
            "category": entry.get("change", ""),
            "type": entry.get("change_type", ""),
            "context": entry.get("explanation", ""),
            # Page keys arrive as strings; line numbers are unknown here.
            "grounding": [{"page": int(page_key), "line": -1}],
        }
        for page_key, page_changes in data.get("changes_by_page", {}).items()
        for entry in page_changes
    ]
    # agentic_doc markdown extraction is currently disabled upstream.
    markdown = ""
    return flattened_changes, markdown
157
+
158
+
159
def pymupdf_pdf_annotator(pdf_path, do_nlp_preprocessing=True):
    """
    Annotates a PDF document by applying highlights and comments based on the changes
    it gets from querying the llm with nlp preprocessing.
    The text is extracted using PyMuPDF.
    The annotations involve identifying specific text passages within the PDF and assigning an appropriate highlight color and comment
    based on the change type (addition, deletion, or modification).

    :param pdf_path: The file path to the PDF document that will be annotated.
    :type pdf_path: str
    :param do_nlp_preprocessing: Flag indicating whether to use NLP preprocessing for text extraction. Default is True.
    :type do_nlp_preprocessing: bool

    :return: Base64-encoded string of the annotated PDF document suitable for embedding in HTML.
    :rtype: str
    """
    try:
        doc = pymupdf.open(pdf_path)
    except Exception as e:
        raise Exception(f"Error opening PDF file: {e}")
    data = get_data_dict_pymupdf(pdf_path, do_nlp_preprocessing)
    if not data:
        raise Exception("No data found in the PDF document. Please check the file.")
    successful_annotations = 0
    failed_annotations = []

    for page_num_str, changes in data.get("changes_by_page", {}).items():
        page_num = int(page_num_str)
        doc_page = doc.load_page(page_num - 1)
        # Sort by length of relevant_text in descending order to avoid overlapping highlights
        changes = sorted(changes, key=lambda c: -len(c["relevant_text"]))
        annotated_areas = []

        for change in changes:
            text = change["relevant_text"]
            change_type = change["change_type"]
            change_str = change["change"]
            comment = change["explanation"]

            # Search for the relevant text on the page
            results = doc_page.search_for(text)
            # we only want the results that do not overlap with already annotated areas
            results = list(
                filter(
                    lambda result: not any(
                        result.intersects(area) for area in annotated_areas
                    ),
                    results,
                )
            )
            if not results:
                print(
                    f"No non-overlapping match found on page {page_num} for: '{text}'"
                )
                failed_annotations.append({"change": change, "page": page_num})
                continue

            # Unknown change types fall back to yellow.
            color = color_mapping.get(change_type, (1, 1, 0))

            annotated_areas.append(results[0])
            highlight = doc_page.add_highlight_annot(results[0])
            highlight.set_colors({"stroke": color})
            highlight.set_info(
                info={
                    "title": "Comment",
                    "content": f"{change_type} - {change_str}\n{comment}",
                    "name": change_type,
                }
            )
            highlight.update()
            successful_annotations += 1

            # if the resulting rects contain anything other than our search text we know it is a multiline highlight because for each line
            # we will have a new result rect. We need to check if the text in the rect is not equal to our search text but is inside of it
            # TODO test with multiple instances of multiline text on same page
            for result in results[1:]:
                resulttext = doc_page.get_textbox(result)
                # Fix: use short-circuiting `and` instead of bitwise `&`
                # between boolean expressions.
                if (
                    resulttext.strip() != text.strip()
                    and resulttext.strip() in text.strip()
                    and not any(result.intersects(area) for area in annotated_areas)
                ):
                    highlight = doc_page.add_highlight_annot(result)
                    highlight.set_colors({"stroke": color})
                    highlight.update()
                    annotated_areas.append(result)

    data["analysis_summary"]["successful_annotations"] = successful_annotations
    add_infos_to_pdf(doc, data["analysis_summary"], "PyMuPDF", do_nlp_preprocessing)
    add_failed_annotations_to_pdf(doc, failed_annotations)
    # NOTE(review): tobytes() is taken BEFORE saveIncr(), so the returned
    # base64 and the file on disk may differ — confirm this ordering is intended.
    base64_pdf = base64.b64encode(doc.tobytes()).decode("utf-8")
    doc.saveIncr()
    doc.close()
    return base64_pdf
253
+
254
+
255
def extract_document_llm(uploaded_document: bytes, do_nlp_preprocessing=True):
    """Run the LLM extraction pipeline and flatten detected changes.

    :param uploaded_document: Raw PDF bytes (or a path) handed to the extractor.
    :param do_nlp_preprocessing: Whether prompts should include NLP insights.
    :return: tuple of (flattened change dicts, markdown string).
    :raises Exception: wrapping extraction or LLM-query failures.
    """
    try:
        llm_structure = create_hierarchical_structure_by_llm(uploaded_document)
    except Exception as e:
        raise Exception(f"Error extracting text from PDF: {e}")

    try:
        if do_nlp_preprocessing:
            data = llm_regulatory_change_detector(llm_structure)
        else:
            data = llm_regulatory_change_detector_without_nlp_insights(llm_structure)
    except Exception as e:
        raise Exception(f"Error querying the LLM: {e}")

    flattened_changes = [
        {
            "text": entry.get("relevant_text", ""),
            "validated": False,
            "confirmed": False,
            "category": entry.get("change", ""),
            "type": entry.get("change_type", ""),
            "context": entry.get("explanation", ""),
            # The LLM output carries no page/line information.
            "grounding": [{"page": -1, "line": -1}],
        }
        for entries in data.get("results", {}).values()
        for entry in entries
    ]
    # agentic_doc markdown extraction is currently disabled upstream.
    markdown = ""
    return flattened_changes, markdown
286
+
287
+
288
def llm_pdf_annotator(pdf_path, do_nlp_preprocessing=True):
    """
    Annotates a PDF document by applying highlights and comments based on the changes
    it gets from querying the llm with nlp preprocessing.
    The text is extracted using an LLM.
    The annotations involve identifying specific text passages within the PDF and assigning an appropriate highlight color and comment
    based on the change type (addition, deletion, or modification).

    :param pdf_path: The file path to the PDF document that will be annotated.
    :type pdf_path: str
    :param do_nlp_preprocessing: Flag indicating whether to use NLP preprocessing for text extraction. Default is True.
    :type do_nlp_preprocessing: bool

    :return: Base64-encoded string of the annotated PDF document suitable for embedding in HTML.
    :rtype: str
    """
    try:
        doc = pymupdf.open(pdf_path)
    except Exception as e:
        raise Exception(f"Error opening PDF file: {e}")

    try:
        llm_structure = create_hierarchical_structure_by_llm(pdf_path)
    except Exception as e:
        raise Exception(f"Error extracting text from PDF: {e}")
    try:
        if do_nlp_preprocessing:
            data_dict = llm_regulatory_change_detector(llm_structure)
        else:
            data_dict = llm_regulatory_change_detector_without_nlp_insights(
                llm_structure
            )
    except Exception as e:
        raise Exception(f"Error querying the LLM: {e}")
    data = data_dict
    successful_annotations = 0
    failed_annotations = []

    for _, changes in data.get("results", {}).items():
        # Sort by length of relevant_text in descending order to avoid overlapping highlights
        changes = sorted(changes, key=lambda c: -len(c["relevant_text"]))
        annotated_areas = []

        for change in changes:
            text = change["relevant_text"]
            change_type = change["change_type"]
            comment = change["explanation"]
            change_str = change["change"]
            results = []
            # search entire document for the text because we dont have the page index in the llm output
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)
                text_instances = page.search_for(text)

                for inst in text_instances:
                    results.append({"page": page_num, "bbox": inst})
            # we only want the results that do not overlap with already annotated areas
            results = list(
                filter(
                    lambda result: not any(
                        result["bbox"].intersects(area) for area in annotated_areas
                    ),
                    results,
                )
            )
            if not results:
                print(
                    f"No non-overlapping match found on page {page_num} for: '{text}'"
                )
                failed_annotations.append({"change": change, "page": page_num})
                continue

            # Unknown change types fall back to yellow.
            color = color_mapping.get(change_type, (1, 1, 0))
            ## we only want the first result because we will add highlights for each line of the multiline text
            doc_page = doc.load_page(results[0]["page"])
            bbox = results[0]["bbox"]
            annotated_areas.append(bbox)
            highlight = doc_page.add_highlight_annot(bbox)
            highlight.set_colors({"stroke": color})
            highlight.set_info(
                info={
                    "title": "Comment",
                    "content": f"{change_type} - {change_str}\n{comment}",
                    "name": change_type,
                }
            )
            highlight.update()
            successful_annotations += 1

            # if the resulting rects contain anything other than our search text we know it is a multiline highlight because for each line
            # we will have a new result rect. We need to check if the text in the rect is not equal to our search text but is inside of it
            for result in results[1:]:
                # Fix: the original read `doc_page.get_textbox(bbox)` — i.e.
                # the FIRST match's page/bbox for every subsequent result —
                # so multiline continuation lines were never highlighted.
                # Use each result's own page and rectangle instead (mirrors
                # the PyMuPDF-variant annotator).
                result_page = doc.load_page(result["page"])
                resulttext = result_page.get_textbox(result["bbox"])
                # Fix: short-circuiting `and` instead of bitwise `&`.
                if (
                    resulttext.strip() != text.strip()
                    and resulttext.strip() in text.strip()
                    and not any(
                        result["bbox"].intersects(area) for area in annotated_areas
                    )
                ):
                    highlight = result_page.add_highlight_annot(result["bbox"])
                    highlight.set_colors({"stroke": color})
                    highlight.update()
                    annotated_areas.append(result["bbox"])

    data["analysis_summary"]["successful_annotations"] = successful_annotations
    add_infos_to_pdf(doc, data["analysis_summary"], "LLM", do_nlp_preprocessing)
    add_failed_annotations_to_pdf(doc, failed_annotations)
    # NOTE(review): tobytes() is taken BEFORE saveIncr(), so the returned
    # base64 and the file on disk may differ — confirm this ordering is intended.
    base64_pdf = base64.b64encode(doc.tobytes()).decode("utf-8")
    doc.saveIncr()
    doc.close()
    return base64_pdf
scripts/pymupdf_nlp_preprocessing.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from dotenv import load_dotenv
4
+ from openai import OpenAI
5
+ from tqdm import tqdm
6
+ from scripts.regulatory_change_foundation import CONTEXT_CATEGORIES
7
+ from scripts.utility_functions import call_nlp_service, render_prompt
8
+
9
+
10
+ # Load environment variables from .env file
11
+ load_dotenv()
12
+
13
+ api_key = os.getenv("OPENAI_API_KEY")
14
+ openai_client = OpenAI(api_key=api_key)
15
+
16
+
17
def preprocess_text_with_nlp(text, max_chunk_size=512, overlap=50):
    """Enhanced NLP preprocessing identical to your first experiment using PyMuPDF text extraction.

    NOTE(review): ``max_chunk_size`` and ``overlap`` are currently unused —
    any chunking is handled by the remote "preprocess_text_with_nlp_pymupdf"
    endpoint.
    """
    return call_nlp_service({"text": text}, "preprocess_text_with_nlp_pymupdf")
20
+
21
+
22
def create_prompt_with_nlp(chunk, preprocessed_data):
    """Render the LLM user prompt for *chunk*, embedding the NLP insights."""
    return render_prompt(chunk, include_nlp=True, preprocessed_data=preprocessed_data)
24
+
25
+
26
def classify_changes_with_nlp(text_content, location_info):
    """Classify changes with NLP preprocessing.

    Args:
        text_content (str): Text of one hierarchical block (with ancestors).
        location_info (dict): Location metadata attached to every hit
            (page_number, block_text).

    Returns:
        list[dict] | None: LLM replies with "changes_detected" truthy, each
        augmented with "location" and "source_text"; ``None`` when nothing
        was detected.
    """
    # Apply NLP preprocessing
    preprocessed_data = preprocess_text_with_nlp(text_content)

    # Split into chunks (using the same method as your first experiment)
    result = call_nlp_service({"text": text_content}, "recursive_character_text_splitter")
    chunks = result["chunks"]

    results = []
    for chunk in chunks:
        response = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "You are a legal expert analyzing German regulatory changes. Return only JSON.",
                },
                {
                    "role": "user",
                    "content": create_prompt_with_nlp(chunk, preprocessed_data),
                },
            ],
            temperature=0.7,
            max_tokens=1024,
        )

        try:
            # NOTE: `result` is reused here for the parsed JSON reply, shadowing
            # the splitter response above (which is no longer needed).
            result = json.loads(response.choices[0].message.content)
            if result.get("changes_detected", False):
                result["location"] = location_info
                result["source_text"] = chunk
                results.append(result)
        except json.JSONDecodeError:
            # Skip chunks whose reply is not valid JSON.
            continue

    return results if results else None
63
+
64
+
65
def extract_hierarchical_text(block):
    """Assemble a block's text preceded by its parent and grandparent texts.

    Ancestor texts (when present and non-None) are prepended in
    grandparent → parent → block order, separated by blank lines, so the
    leaf text keeps its surrounding headings as context.
    """
    parent = block.get("parent")
    grandparent = parent.get("parent") if parent is not None else None

    segments = []
    if grandparent is not None:
        segments.append(grandparent["text"])
    if parent is not None:
        segments.append(parent["text"])
    segments.append(block["text"])

    return "\n\n".join(segments)
87
+
88
+
89
def traverse_blocks_with_nlp(blocks, parent=None, results=None, is_top_level=True):
    """Traverse hierarchy with NLP-enhanced analysis.

    Depth-first walk over the PyMuPDF block tree; leaf blocks (empty
    "children") are analyzed, inner blocks are recursed into.

    Args:
        blocks (list[dict]): Sibling blocks at the current level.
        parent (dict | None): The enclosing block; stored on each child.
        results (list | None): Accumulator shared across recursion.
        is_top_level (bool): Show a tqdm progress bar only for the root call.

    Returns:
        list[dict]: All change dicts collected from leaf blocks.
    """
    if results is None:
        results = []

    iterable = (
        tqdm(blocks, desc="Processing Text blocks with NLP") if is_top_level else blocks
    )

    for block in iterable:
        # NOTE: mutates the input structure — each block gains a back-link
        # used later by extract_hierarchical_text.
        block["parent"] = parent

        if "children" in block and not block["children"]:  # Leaf node
            text_content = extract_hierarchical_text(block)
            location_info = {
                "page_number": block["page_number"],
                "block_text": block["text"],
            }

            changes = classify_changes_with_nlp(text_content, location_info)
            if changes:
                for change in changes:
                    change["full_text"] = text_content
                    results.append(change)
        else:
            traverse_blocks_with_nlp(
                block["children"], block, results, is_top_level=False
            )

    return results
119
+
120
+
121
def pymupdf_regulatory_change_detector_with_nlp_insights(hierarchical_structure):
    """Main function with NLP integration.

    Args:
        hierarchical_structure (dict): PyMuPDF extractor output with "blocks".

    Returns:
        tuple: (``{"analysis_summary": ..., "changes_by_page": ...}``, raw
        per-block results). Falsy input yields an error dict and empty list.
    """
    if not hierarchical_structure:
        return {"error": "No structure provided"}, []

    analysis_summary = {
        "total_changes_detected": 0,
        # Singular keys here (vs plural in the llm_* detectors);
        # add_infos_to_pdf accepts both spellings.
        "changes_by_type": {"addition": 0, "deletion": 0, "modification": 0},
    }
    changes_by_page = {}

    results = traverse_blocks_with_nlp(hierarchical_structure["blocks"])

    for change in results:
        analysis_summary["total_changes_detected"] += len(change["classifications"])
        for classification in change["classifications"]:
            analysis_summary["changes_by_type"][classification["change_type"]] += 1

            # "context" vs "scope" depending on the change category
            change_subtype = (
                "context" if classification["change"] in CONTEXT_CATEGORIES else "scope"
            )
            page_num = change["location"]["page_number"]
            changes_by_page.setdefault(page_num, []).append(
                {
                    "change": classification["change"],
                    "change_type": classification["change_type"],
                    "change_subtype": change_subtype,
                    "relevant_text": classification["relevant_text"],
                    "explanation": classification["explanation"],
                    "nlp_evidence": classification["evidence"],
                }
            )

    return {
        "analysis_summary": analysis_summary,
        "changes_by_page": changes_by_page,
    }, results
scripts/pymupdf_no_nlp_preprocessing.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from dotenv import load_dotenv
4
+ from openai import OpenAI
5
+ from tqdm import tqdm
6
+ from scripts.regulatory_change_foundation import CONTEXT_CATEGORIES
7
+ from scripts.utility_functions import render_prompt
8
+ from scripts.pymupdf_nlp_preprocessing import extract_hierarchical_text
9
+
10
+
11
+ # Load environment variables from .env file
12
+ load_dotenv()
13
+
14
+ #nlp = spacy.load("de_core_news_sm")
15
+ api_key = os.getenv("OPENAI_API_KEY")
16
+ openai_client = OpenAI(api_key=api_key)
17
+
18
+
19
def create_prompt_without_nlp_insights(text):
    """Render the LLM classification prompt for *text* with NLP hints disabled.

    Thin wrapper around ``render_prompt`` that fixes ``include_nlp=False`` so no
    preprocessed NLP data is embedded in the prompt.
    """
    return render_prompt(text, include_nlp=False)
21
+
22
+
23
def classify_changes_without_nlp_insights(text_content, location_info):
    """Classify changes in text chunks using OpenAI.

    Sends *text_content* to ``gpt-4o-mini`` with a system prompt framing the
    model as a German regulatory-law expert that must answer in JSON only.

    Args:
        text_content: Text of a leaf block to analyze.
        location_info: Location metadata attached to a positive result under
            the ``"location"`` key (e.g. page number and block text).

    Returns:
        The parsed model output (dict) with ``"location"`` and
        ``"source_text"`` added when ``changes_detected`` is truthy; otherwise
        ``None`` — including when the model response is not valid JSON.
    """

    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "You are a legal expert specializing in analyzing German regulatory documents with a focus on identifying regulatory changes. Only return JSON output.",
            },
            {
                "role": "user",
                "content": create_prompt_without_nlp_insights(text_content),
            },
        ],
        temperature=0.7,
        max_tokens=1024,
    )

    try:
        # The model is instructed to return JSON only, but that is not
        # guaranteed — fall through to None on malformed output.
        result = json.loads(response.choices[0].message.content)
        if result.get("changes_detected", False):
            result["location"] = location_info
            result["source_text"] = text_content
            return result
        return None
    except json.JSONDecodeError:
        return None
51
+
52
+
53
def traverse_blocks(
    blocks, parent=None, grandparent=None, results=None, is_top_level=True
):
    """Traverse the hierarchical structure depth-first and analyze leaf nodes.

    Each leaf (a block with no children) has its hierarchical text extracted
    and classified via the LLM; detected changes are accumulated in *results*.

    Args:
        blocks: List of block dicts to traverse.
        parent: Parent block of *blocks*; stored on each block for context
            tracking.
        grandparent: Unused directly; kept for backward compatibility with
            existing callers.
        results: Accumulator list, created on the first (top-level) call.
        is_top_level: When True, wrap the outermost iteration in a tqdm bar.

    Returns:
        list: Change dicts, each carrying the leaf's full text under "text".
    """
    if results is None:
        results = []

    # Only the outermost call shows a progress bar. (Fixed the description:
    # this module is explicitly the no-NLP variant.)
    iterable = (
        tqdm(blocks, desc="Processing text blocks") if is_top_level else blocks
    )

    for block in iterable:
        # Record the parent reference so downstream code can walk upwards.
        block["parent"] = parent

        # A block with a missing "children" key used to fall into the recursion
        # branch and raise KeyError; treat it as a leaf instead.
        children = block.get("children")
        if not children:  # Leaf node: no children key, or an empty list
            text_content = extract_hierarchical_text(block)

            location_info = {
                "page_number": block["page_number"],
                "block_text": block["text"],
            }

            # Analyze the text for changes
            changes = classify_changes_without_nlp_insights(text_content, location_info)
            if changes:
                # Keep the full hierarchical text alongside the classification
                changes["text"] = text_content
                results.append(changes)
        else:
            traverse_blocks(children, block, parent, results, is_top_level=False)

    return results
91
+
92
+
93
def pymupdf_regulatory_change_detector_without_nlp_insights(hierarchical_structure):
    """Detect regulatory changes in the hierarchical structure (no NLP insights).

    Traverses the block tree (``traverse_blocks``), then aggregates the
    per-leaf classifications into a per-type summary and a per-page grouping.

    Args:
        hierarchical_structure (dict): Parsed document structure with a
            top-level ``"blocks"`` list.

    Returns:
        tuple[dict, list]: ``({"analysis_summary": ..., "changes_by_page": ...},
        raw_results)``; on missing input, ``({"error": ...}, [])``.
    """
    if not hierarchical_structure:
        # Return a tuple so callers that unpack (summary, results) do not crash
        # on the error path — consistent with the NLP-enabled variant, which
        # returns ({"error": ...}, []).
        return {"error": "No hierarchical structure provided"}, []

    analysis_summary = {
        "total_changes_detected": 0,
        "changes_by_type": {"addition": 0, "deletion": 0, "modification": 0},
    }
    changes_by_page = {}

    # Traverse the blocks and analyze leaf nodes
    results = traverse_blocks(hierarchical_structure["blocks"])

    # Update analysis summary
    for change in results:
        classifications = change.get("classifications", [])
        analysis_summary["total_changes_detected"] += len(classifications)

        for classification in classifications:
            # change_type is LLM output; count unexpected labels instead of
            # raising KeyError on the three pre-seeded keys.
            change_type = classification["change_type"]
            analysis_summary["changes_by_type"][change_type] = (
                analysis_summary["changes_by_type"].get(change_type, 0) + 1
            )

            # Group changes by page number
            page_number = change["location"]["page_number"]
            change_subtype = (
                "context" if classification["change"] in CONTEXT_CATEGORIES else "scope"
            )
            changes_by_page.setdefault(page_number, []).append(
                {
                    "change": classification["change"],
                    "change_type": classification["change_type"],
                    "change_subtype": change_subtype,
                    "relevant_text": classification["relevant_text"],
                    "text": change["text"],
                    "explanation": classification["explanation"],
                }
            )

    # Combine analysis summary and grouped changes
    final_output = {
        "analysis_summary": analysis_summary,
        "changes_by_page": changes_by_page,
    }

    return final_output, results
scripts/text_extraction_landing_ai.py CHANGED
@@ -2,27 +2,22 @@ import os
2
  import json
3
  import glob
4
  from agentic_doc.parse import parse
5
- from streamlit.runtime.uploaded_file_manager import UploadedFile
6
 
 
 
7
 
8
- def extract_document(
9
- uploaded_document: UploadedFile, extraction_dir="text_extractions/"
10
- ):
11
- """
12
- Extract text from documents if not already extracted.
13
-
14
- Args:
15
- uploaded_document: UploadedFile: The document to extract text from.
16
- extraction_dir (str): Directory to store/check for extracted result
17
 
18
- Returns:
19
- dict: the json which we get from landing ai api
20
- """
 
 
 
21
  # Ensure extraction directory exists
22
  os.makedirs(extraction_dir, exist_ok=True)
23
 
24
  # Get the base document name (without extension)
25
- document_name = os.path.splitext(uploaded_document.name)[0]
26
 
27
  # Pattern to match existing extractions (e.g., "documentABC_*.json")
28
  existing_extraction_pattern = os.path.join(
@@ -39,9 +34,55 @@ def extract_document(
39
  else:
40
  try:
41
  print(f"No existing extraction found for {document_name}, calling API...")
42
- result = parse(uploaded_document.read())
43
  print(f"Successfully extracted {document_name}")
44
  except Exception as e:
45
  print(f"Error extracting {document_name}: {str(e)}")
46
  result = {"status": "error", "error": str(e)}
47
- return json.loads(result[0].model_dump_json())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import json
3
  import glob
4
  from agentic_doc.parse import parse
 
5
 
6
+ from scripts.pymupdf_nlp_preprocessing import classify_changes_with_nlp
7
+ from scripts.pymupdf_no_nlp_preprocessing import classify_changes_without_nlp_insights
8
 
 
 
 
 
 
 
 
 
 
9
 
10
+ def extract_document_agentic(
11
+ uploaded_document_name: str,
12
+ uploaded_document_bytes: bytes,
13
+ do_nlp_preprocessing=True,
14
+ extraction_dir="text_extractions/",
15
+ ):
16
  # Ensure extraction directory exists
17
  os.makedirs(extraction_dir, exist_ok=True)
18
 
19
  # Get the base document name (without extension)
20
+ document_name = os.path.splitext(uploaded_document_name)[0]
21
 
22
  # Pattern to match existing extractions (e.g., "documentABC_*.json")
23
  existing_extraction_pattern = os.path.join(
 
34
  else:
35
  try:
36
  print(f"No existing extraction found for {document_name}, calling API...")
37
+ result = json.loads(parse(uploaded_document_bytes)[0].model_dump_json())
38
  print(f"Successfully extracted {document_name}")
39
  except Exception as e:
40
  print(f"Error extracting {document_name}: {str(e)}")
41
  result = {"status": "error", "error": str(e)}
42
+ return result
43
+ if result:
44
+ if "chunks" in result and isinstance(result["chunks"], list):
45
+ for chunk in result["chunks"]:
46
+ if do_nlp_preprocessing:
47
+ classification_result = classify_changes_with_nlp(chunk["text"], "")
48
+ # flatten into a single json element so it matches non-nlp part
49
+ if classification_result and len(classification_result) > 0:
50
+ flattened_classifications = {"changes_detected": classification_result[0].get("changes_detected", False), "classifications": []}
51
+ for class_res in classification_result:
52
+ if class_res.get("changes_detected", False):
53
+ flattened_classifications["classifications"].extend(class_res.get("classifications", []))
54
+ classification_result = flattened_classifications
55
+ else:
56
+ classification_result = classify_changes_without_nlp_insights(
57
+ chunk["text"], ""
58
+ )
59
+ if classification_result and classification_result.get(
60
+ "changes_detected", False
61
+ ):
62
+ subchunks = []
63
+ for subchunk in classification_result.get(
64
+ "classifications", []
65
+ ):
66
+ subchunks.append(
67
+ {
68
+ "text": subchunk.get("relevant_text", ""),
69
+ "validated": False,
70
+ "confirmed": False,
71
+ "category": subchunk.get("change", ""),
72
+ "type": subchunk.get("change_type", ""),
73
+ "context": subchunk.get("explanation", ""),
74
+ }
75
+ )
76
+ chunk["subchunks"] = subchunks
77
+ else:
78
+ result["chunks"].remove(chunk)
79
+ # Create flattened list of subchunks for UI compatibility
80
+ flattened_changes = []
81
+ for chunk in result["chunks"]:
82
+ if "subchunks" in chunk:
83
+ for subchunk in chunk["subchunks"]:
84
+ subchunk["grounding"] = chunk["grounding"]
85
+ subchunk["grounding"][0]["line"] = -1
86
+ subchunk["chunk_id"] = chunk["chunk_id"]
87
+ flattened_changes.append(subchunk)
88
+ return flattened_changes, result.get("markdown", "")
scripts/utility_functions.py CHANGED
@@ -3,6 +3,7 @@ import os
3
  import json
4
  import re
5
  from rapidfuzz import fuzz
 
6
  from scripts.regulatory_change_foundation import (
7
  CLASSIFICATION_INFO,
8
  FEW_SHOT_EXAMPLES,
@@ -88,7 +89,7 @@ def highlight_nth(text, change, skip_failed=False):
88
 
89
  # TODO:check treshhold->51 would get always a result
90
  # if we make it lower we get guaranteed matches but they might be different from the original target, but if threshold is too high we might not find any match eg when a word is missing
91
- def highlight_fuzzy_match(text, change, n=0, threshold=86, skip_failed=False):
92
  target = change["text"]
93
  window_size = len(target)
94
  step = 1
@@ -123,6 +124,31 @@ def highlight_fuzzy_match(text, change, n=0, threshold=86, skip_failed=False):
123
  return text[:start_norm] + highlighted_span + text[end_norm:]
124
 
125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  def render_prompt(text, include_nlp=False, preprocessed_data=None):
127
  classification_json = json.dumps(CLASSIFICATION_INFO, indent=2)
128
  few_shot_json = json.dumps(FEW_SHOT_EXAMPLES, indent=2)
@@ -170,3 +196,14 @@ def save_json_to_file(data, output_dir, output_file):
170
 
171
  # Print the location of the saved file
172
  print(f"JSON data saved successfully at: {file_path}")
 
 
 
 
 
 
 
 
 
 
 
 
3
  import json
4
  import re
5
  from rapidfuzz import fuzz
6
+ import requests
7
  from scripts.regulatory_change_foundation import (
8
  CLASSIFICATION_INFO,
9
  FEW_SHOT_EXAMPLES,
 
89
 
90
  # TODO:check treshhold->51 would get always a result
91
  # if we make it lower we get guaranteed matches but they might be different from the original target, but if threshold is too high we might not find any match eg when a word is missing
92
+ def highlight_fuzzy_match(text, change, n=0, threshold=80, skip_failed=False):
93
  target = change["text"]
94
  window_size = len(target)
95
  step = 1
 
124
  return text[:start_norm] + highlighted_span + text[end_norm:]
125
 
126
 
127
+ # TODO:check treshhold->51 would get always a result
128
+ # if we make it lower we get guaranteed matches but they might be different from the original target, but if threshold is too high we might not find any match eg when a word is missing
129
# TODO:check treshhold->51 would get always a result
# if we make it lower we get guaranteed matches but they might be different from the original target, but if threshold is too high we might not find any match eg when a word is missing
def get_best_fuzzy_match(text, change, threshold=65):
    """Find the best fuzzy match for a change in the text and return the matched section.

    Slides a window of ``len(change["text"])`` over *text*, scores each window
    with rapidfuzz ``partial_ratio``, and returns the n-th ranked window, where
    n is ``change.get("occurrence_index", 0)``.

    Args:
        text: Document text to search in.
        change: Dict with "text" (the target span) and optionally
            "occurrence_index" (which ranked candidate to return).
        threshold: Minimum fuzzy score (0-100) for a window to qualify.

    Returns:
        The matched slice of *text*, or None when no window reaches the
        threshold (or the target is empty / longer than the text).
    """
    n = change.get("occurrence_index", 0)
    target = change["text"]
    window_size = len(target)
    # Guard degenerate inputs: an empty target or a target longer than the
    # text cannot produce a meaningful window.
    if window_size == 0 or window_size > len(text):
        return None

    target_lower = target.lower()  # hoist loop-invariant lowercasing
    candidates = []
    # "+ 1" so the final window position is scanned; the previous upper bound
    # of len(text) - window_size skipped it, so an exact full-length match
    # (len(text) == len(target)) returned None.
    for start in range(len(text) - window_size + 1):
        window = text[start : start + window_size]
        score = fuzz.partial_ratio(window.lower(), target_lower)
        if score >= threshold:
            candidates.append((score, start, start + window_size))

    if not candidates:
        return None
    # Highest score first; clamp n so an out-of-range occurrence_index falls
    # back to the weakest candidate instead of raising IndexError.
    candidates.sort(reverse=True)
    _, start_norm, end_norm = candidates[min(n, len(candidates) - 1)]

    return text[start_norm:end_norm]
150
+
151
+
152
  def render_prompt(text, include_nlp=False, preprocessed_data=None):
153
  classification_json = json.dumps(CLASSIFICATION_INFO, indent=2)
154
  few_shot_json = json.dumps(FEW_SHOT_EXAMPLES, indent=2)
 
196
 
197
  # Print the location of the saved file
198
  print(f"JSON data saved successfully at: {file_path}")
199
+
200
+
201
def call_nlp_service(payload, method, timeout=120):
    """POST *payload* to the remote NLP preprocessing service and return its JSON.

    Args:
        payload: Form fields to send (e.g. ``{"text": ...}``); sent
            form-encoded via ``data=``, matching the existing service contract.
        method: Endpoint name appended to the service base URL.
        timeout: Seconds to wait for the request; without one, a hung service
            would block the caller indefinitely. Backward-compatible default.

    Returns:
        dict: Decoded JSON response body.

    Raises:
        Exception: When the service responds with a non-200 status.
        requests.RequestException: On connection errors or timeout.
    """
    url = f"https://amougou-fortiss-nlp-preprocessor.hf.space/{method}"

    # Make the request
    response = requests.post(url, data=payload, timeout=timeout)
    if response.status_code == 200:
        return response.json()
    raise Exception(f"NLP service error: {response.status_code} - {response.text}")