Maximilian Amougou commited on
Commit
d6b760c
·
verified ·
1 Parent(s): 8c1316d

Upload 6 files

Browse files
scripts/llm_nlp_preprocessing.py CHANGED
@@ -1,7 +1,8 @@
1
  import json
2
  import os
 
3
  from dotenv import load_dotenv
4
- from openai import OpenAI
5
  from scripts.regulatory_change_foundation import CONTEXT_CATEGORIES
6
  from scripts.utility_functions import call_nlp_service, render_prompt
7
 
@@ -10,7 +11,7 @@ from scripts.utility_functions import call_nlp_service, render_prompt
10
  load_dotenv()
11
 
12
  api_key = os.getenv("OPENAI_API_KEY")
13
- openai_client = OpenAI(api_key=api_key)
14
 
15
 
16
  def preprocess_text_with_nlp(text, max_chunk_size=512, overlap=50):
@@ -22,36 +23,35 @@ def create_prompt(chunk, preprocessed_data):
22
  return render_prompt(chunk, include_nlp=True, preprocessed_data=preprocessed_data)
23
 
24
 
25
- def search_for_regulatory_changes(chunks, preprocessed_data, subtitle):
26
- results = []
27
-
28
- for chunk in chunks:
29
- response = openai_client.chat.completions.create(
30
- model="gpt-4o-mini",
31
- messages=[
32
- {
33
- "role": "system",
34
- "content": "You are a legal expert specializing in analyzing German regulatory documents with a focus on identifying regulatory changes. Only return JSON output.",
35
- },
36
- {"role": "user", "content": create_prompt(chunk, preprocessed_data)},
37
- ],
38
- temperature=0.7,
39
- max_tokens=1024,
40
- )
41
-
42
  try:
 
 
 
 
 
 
 
 
 
 
 
 
43
  result = json.loads(response.choices[0].message.content)
44
  if result.get("changes_detected", False):
45
- result["location"] = {"subtitle": subtitle} # Use subtitle as location
46
  result["source_text"] = chunk
47
- results.append(result)
48
- except json.JSONDecodeError:
49
- continue
50
-
51
- return results
 
 
52
 
53
 
54
- def detect_regulatory_changes(text_content, subtitle):
55
  """
56
  Main function to detect regulatory changes from text content.
57
 
@@ -67,7 +67,7 @@ def detect_regulatory_changes(text_content, subtitle):
67
  chunks, preprocessed_data = preprocess_text_with_nlp(text_content)
68
 
69
  # Classify changes using NLP insights
70
- results = search_for_regulatory_changes(chunks, preprocessed_data, subtitle)
71
 
72
  return results
73
 
@@ -80,21 +80,29 @@ def llm_regulatory_change_detector(hierarchical_structure, progress_callback=Non
80
  }
81
  subtitles = {}
82
 
83
- # Iterate over sections and analyze content
84
- total_sections = len(hierarchical_structure["sections"])
85
- for idx, section in enumerate(hierarchical_structure["sections"]):
86
- # Update progress if callback provided
87
- if progress_callback:
88
- progress_callback((idx + 1) / total_sections)
 
 
 
 
 
89
  if status_callback:
90
- status_callback(f"Analyzing section {idx + 1}/{total_sections}: {section.get('subtitle', 'Untitled')}")
91
- subtitle = section["subtitle"]
92
- content = section["content"]
93
- if isinstance(content, list):
94
- content = "\n".join(content)
95
-
96
- # Detect changes for this subtitle
97
- changes = detect_regulatory_changes(content, subtitle)
 
 
 
98
 
99
  # Update analysis summary
100
  for change in changes:
 
1
  import json
2
  import os
3
+ import asyncio
4
  from dotenv import load_dotenv
5
+ from openai import AsyncOpenAI
6
  from scripts.regulatory_change_foundation import CONTEXT_CATEGORIES
7
  from scripts.utility_functions import call_nlp_service, render_prompt
8
 
 
11
  load_dotenv()
12
 
13
  api_key = os.getenv("OPENAI_API_KEY")
14
+ openai_client = AsyncOpenAI(api_key=api_key, timeout=60)
15
 
16
 
17
  def preprocess_text_with_nlp(text, max_chunk_size=512, overlap=50):
 
23
  return render_prompt(chunk, include_nlp=True, preprocessed_data=preprocessed_data)
24
 
25
 
26
+ async def search_for_regulatory_changes(chunks, preprocessed_data, subtitle):
27
+ async def process_chunk(chunk):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  try:
29
+ response = await openai_client.chat.completions.create(
30
+ model="gpt-4o-mini",
31
+ messages=[
32
+ {
33
+ "role": "system",
34
+ "content": "You are a legal expert specializing in analyzing German regulatory documents with a focus on identifying regulatory changes. Only return JSON output.",
35
+ },
36
+ {"role": "user", "content": create_prompt(chunk, preprocessed_data)},
37
+ ],
38
+ temperature=0.7,
39
+ max_tokens=1024,
40
+ )
41
  result = json.loads(response.choices[0].message.content)
42
  if result.get("changes_detected", False):
43
+ result["location"] = {"subtitle": subtitle}
44
  result["source_text"] = chunk
45
+ return result
46
+ except (json.JSONDecodeError, Exception):
47
+ return None
48
+
49
+ tasks = [process_chunk(chunk) for chunk in chunks]
50
+ results = await asyncio.gather(*tasks)
51
+ return [r for r in results if r is not None]
52
 
53
 
54
+ async def detect_regulatory_changes(text_content, subtitle):
55
  """
56
  Main function to detect regulatory changes from text content.
57
 
 
67
  chunks, preprocessed_data = preprocess_text_with_nlp(text_content)
68
 
69
  # Classify changes using NLP insights
70
+ results = await search_for_regulatory_changes(chunks, preprocessed_data, subtitle)
71
 
72
  return results
73
 
 
80
  }
81
  subtitles = {}
82
 
83
+ async def process_all_sections():
84
+ async def process_section(section):
85
+ subtitle = section["subtitle"]
86
+ content = section["content"]
87
+ if isinstance(content, list):
88
+ content = "\n".join(content)
89
+
90
+ # Detect changes for this subtitle
91
+ changes = await detect_regulatory_changes(content, subtitle)
92
+ return subtitle, changes
93
+
94
  if status_callback:
95
+ status_callback(f"Processing all {len(hierarchical_structure['sections'])} sections concurrently...")
96
+
97
+ tasks = [process_section(section) for section in hierarchical_structure["sections"]]
98
+ results = await asyncio.gather(*tasks)
99
+ return results
100
+
101
+ # Run async processing
102
+ section_results = asyncio.run(process_all_sections())
103
+
104
+ # Process results
105
+ for subtitle, changes in section_results:
106
 
107
  # Update analysis summary
108
  for change in changes:
scripts/llm_no_nlp_preprocessing.py CHANGED
@@ -1,7 +1,8 @@
1
  import json
2
  import os
 
3
  from dotenv import load_dotenv
4
- from openai import OpenAI
5
  from scripts.regulatory_change_foundation import CONTEXT_CATEGORIES
6
  from scripts.utility_functions import render_prompt
7
 
@@ -10,43 +11,47 @@ from scripts.utility_functions import render_prompt
10
  load_dotenv()
11
 
12
  api_key = os.getenv("OPENAI_API_KEY")
13
- openai_client = OpenAI(api_key=api_key)
14
 
15
 
16
  def create_prompt_without_nlp_insights(text):
17
  return render_prompt(text, include_nlp=False)
18
 
19
 
20
- def classify_changes_without_nlp_insights(text_content, subtitle):
21
  """Classify changes in text chunks using OpenAI."""
22
 
23
  chunks = text_content.split("\n\n")
24
- results = []
25
-
26
- for chunk in chunks:
27
- response = openai_client.chat.completions.create(
28
- model="gpt-4o-mini",
29
- messages=[
30
- {
31
- "role": "system",
32
- "content": "You are a legal expert specializing in analyzing German regulatory documents with a focus on identifying regulatory changes. Only return JSON output.",
33
- },
34
- {"role": "user", "content": create_prompt_without_nlp_insights(chunk)},
35
- ],
36
- temperature=0.7,
37
- max_tokens=1024,
38
- )
39
-
40
  try:
 
 
 
 
 
 
 
 
 
 
 
 
41
  result = json.loads(response.choices[0].message.content)
42
  if result.get("changes_detected", False):
43
- result["location"] = {"subtitle": subtitle} # Use subtitle as location
44
  result["source_text"] = chunk
45
- results.append(result)
46
- except json.JSONDecodeError:
47
- continue
 
 
 
 
48
 
49
- return results
 
 
50
 
51
 
52
  def llm_regulatory_change_detector_without_nlp_insights(hierarchical_structure, progress_callback=None, status_callback=None):
@@ -57,21 +62,29 @@ def llm_regulatory_change_detector_without_nlp_insights(hierarchical_structure,
57
  }
58
  subtitles = {}
59
 
60
- # Iterate over sections and analyze content
61
- total_sections = len(hierarchical_structure["sections"])
62
- for idx, section in enumerate(hierarchical_structure["sections"]):
63
- # Update progress if callback provided
64
- if progress_callback:
65
- progress_callback((idx + 1) / total_sections)
 
 
 
 
 
66
  if status_callback:
67
- status_callback(f"Analyzing section {idx + 1}/{total_sections}: {section.get('subtitle', 'Untitled')}")
68
- subtitle = section["subtitle"]
69
- content = section["content"]
70
- if isinstance(content, list):
71
- content = "\n".join(content)
72
-
73
- # Detect changes for this subtitle
74
- changes = classify_changes_without_nlp_insights(content, subtitle)
 
 
 
75
 
76
  # Update analysis summary
77
  for change in changes:
 
1
  import json
2
  import os
3
+ import asyncio
4
  from dotenv import load_dotenv
5
+ from openai import AsyncOpenAI
6
  from scripts.regulatory_change_foundation import CONTEXT_CATEGORIES
7
  from scripts.utility_functions import render_prompt
8
 
 
11
  load_dotenv()
12
 
13
  api_key = os.getenv("OPENAI_API_KEY")
14
+ openai_client = AsyncOpenAI(api_key=api_key, timeout=60)
15
 
16
 
17
  def create_prompt_without_nlp_insights(text):
18
  return render_prompt(text, include_nlp=False)
19
 
20
 
21
+ async def classify_changes_without_nlp_insights(text_content, subtitle):
22
  """Classify changes in text chunks using OpenAI."""
23
 
24
  chunks = text_content.split("\n\n")
25
+
26
+ async def process_chunk(chunk):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  try:
28
+ response = await openai_client.chat.completions.create(
29
+ model="gpt-4o-mini",
30
+ messages=[
31
+ {
32
+ "role": "system",
33
+ "content": "You are a legal expert specializing in analyzing German regulatory documents with a focus on identifying regulatory changes. Only return JSON output.",
34
+ },
35
+ {"role": "user", "content": create_prompt_without_nlp_insights(chunk)},
36
+ ],
37
+ temperature=0.7,
38
+ max_tokens=1024,
39
+ )
40
  result = json.loads(response.choices[0].message.content)
41
  if result.get("changes_detected", False):
42
+ result["location"] = {"subtitle": subtitle}
43
  result["source_text"] = chunk
44
+ return result
45
+ except (json.JSONDecodeError, Exception):
46
+ return None
47
+
48
+ tasks = [process_chunk(chunk) for chunk in chunks]
49
+ results = await asyncio.gather(*tasks)
50
+ return [r for r in results if r is not None]
51
 
52
+ # Async wrapper for backward compatibility
53
+ async def classify_changes_without_nlp_insights_async(text_content, subtitle):
54
+ return await classify_changes_without_nlp_insights(text_content, subtitle)
55
 
56
 
57
  def llm_regulatory_change_detector_without_nlp_insights(hierarchical_structure, progress_callback=None, status_callback=None):
 
62
  }
63
  subtitles = {}
64
 
65
+ async def process_all_sections():
66
+ async def process_section(section):
67
+ subtitle = section["subtitle"]
68
+ content = section["content"]
69
+ if isinstance(content, list):
70
+ content = "\n".join(content)
71
+
72
+ # Detect changes for this subtitle
73
+ changes = await classify_changes_without_nlp_insights(content, subtitle)
74
+ return subtitle, changes
75
+
76
  if status_callback:
77
+ status_callback(f"Processing all {len(hierarchical_structure['sections'])} sections concurrently...")
78
+
79
+ tasks = [process_section(section) for section in hierarchical_structure["sections"]]
80
+ results = await asyncio.gather(*tasks)
81
+ return results
82
+
83
+ # Run async processing
84
+ section_results = asyncio.run(process_all_sections())
85
+
86
+ # Process results
87
+ for subtitle, changes in section_results:
88
 
89
  # Update analysis summary
90
  for change in changes:
scripts/pymupdf_nlp_preprocessing.py CHANGED
@@ -1,7 +1,8 @@
1
  import json
2
  import os
 
3
  from dotenv import load_dotenv
4
- from openai import OpenAI
5
  from scripts.regulatory_change_foundation import CONTEXT_CATEGORIES
6
  from scripts.utility_functions import call_nlp_service, render_prompt
7
 
@@ -10,7 +11,7 @@ from scripts.utility_functions import call_nlp_service, render_prompt
10
  load_dotenv()
11
 
12
  api_key = os.getenv("OPENAI_API_KEY")
13
- openai_client = OpenAI(api_key=api_key)
14
 
15
 
16
  def preprocess_text_with_nlp(text, max_chunk_size=512, overlap=50):
@@ -22,7 +23,7 @@ def create_prompt_with_nlp(chunk, preprocessed_data):
22
  return render_prompt(chunk, include_nlp=True, preprocessed_data=preprocessed_data)
23
 
24
 
25
- def classify_changes_with_nlp(text_content, location_info):
26
  """Classify changes with NLP preprocessing."""
27
  # Apply NLP preprocessing
28
  preprocessed_data = preprocess_text_with_nlp(text_content)
@@ -31,34 +32,39 @@ def classify_changes_with_nlp(text_content, location_info):
31
  result = call_nlp_service({"text": text_content}, "recursive_character_text_splitter")
32
  chunks = result["chunks"]
33
 
34
- results = []
35
- for chunk in chunks:
36
- response = openai_client.chat.completions.create(
37
- model="gpt-4o-mini",
38
- messages=[
39
- {
40
- "role": "system",
41
- "content": "You are a legal expert analyzing German regulatory changes. Return only JSON.",
42
- },
43
- {
44
- "role": "user",
45
- "content": create_prompt_with_nlp(chunk, preprocessed_data),
46
- },
47
- ],
48
- temperature=0.7,
49
- max_tokens=1024,
50
- )
51
-
52
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  result = json.loads(response.choices[0].message.content)
54
  if result.get("changes_detected", False):
55
  result["location"] = location_info
56
  result["source_text"] = chunk
57
- results.append(result)
58
- except json.JSONDecodeError:
59
- continue
 
 
 
 
 
60
 
61
- return results if results else None
 
 
62
 
63
 
64
  def extract_hierarchical_text(block):
@@ -85,38 +91,39 @@ def extract_hierarchical_text(block):
85
  return "\n\n".join(text_parts)
86
 
87
 
88
- def traverse_blocks_with_nlp(blocks, parent=None, results=None, is_top_level=True, progress_callback=None, status_callback=None):
89
- """Traverse hierarchy with NLP-enhanced analysis."""
90
- if results is None:
91
- results = []
92
-
93
- total_blocks = len(blocks) if is_top_level else 0
94
 
95
- for idx, block in enumerate(blocks):
96
- if is_top_level and progress_callback:
97
- progress_callback((idx + 1) / total_blocks)
98
- if is_top_level and status_callback:
99
- status_callback(f"Processing text block {idx + 1}/{total_blocks} with NLP")
100
  block["parent"] = parent
101
-
102
  if "children" in block and not block["children"]: # Leaf node
103
  text_content = extract_hierarchical_text(block)
104
  location_info = {
105
  "page_number": block["page_number"],
106
  "block_text": block["text"],
107
  }
108
-
109
- changes = classify_changes_with_nlp(text_content, location_info)
110
  if changes:
111
  for change in changes:
112
  change["full_text"] = text_content
113
- results.append(change)
114
  else:
115
- traverse_blocks_with_nlp(
116
- block["children"], block, results, is_top_level=False, progress_callback=progress_callback, status_callback=status_callback
117
- )
118
-
119
- return results
 
 
 
 
 
 
 
 
 
120
 
121
 
122
  def pymupdf_regulatory_change_detector_with_nlp_insights(hierarchical_structure, progress_callback=None, status_callback=None):
@@ -131,9 +138,10 @@ def pymupdf_regulatory_change_detector_with_nlp_insights(hierarchical_structure,
131
  changes_by_page = {}
132
 
133
  if status_callback:
134
- status_callback("Analyzing document structure with NLP...")
135
 
136
- results = traverse_blocks_with_nlp(hierarchical_structure["blocks"], progress_callback=progress_callback, status_callback=status_callback)
 
137
 
138
  for change in results:
139
  analysis_summary["total_changes_detected"] += len(change["classifications"])
 
1
  import json
2
  import os
3
+ import asyncio
4
  from dotenv import load_dotenv
5
+ from openai import AsyncOpenAI
6
  from scripts.regulatory_change_foundation import CONTEXT_CATEGORIES
7
  from scripts.utility_functions import call_nlp_service, render_prompt
8
 
 
11
  load_dotenv()
12
 
13
  api_key = os.getenv("OPENAI_API_KEY")
14
+ openai_client = AsyncOpenAI(api_key=api_key, timeout=60)
15
 
16
 
17
  def preprocess_text_with_nlp(text, max_chunk_size=512, overlap=50):
 
23
  return render_prompt(chunk, include_nlp=True, preprocessed_data=preprocessed_data)
24
 
25
 
26
+ async def classify_changes_with_nlp(text_content, location_info):
27
  """Classify changes with NLP preprocessing."""
28
  # Apply NLP preprocessing
29
  preprocessed_data = preprocess_text_with_nlp(text_content)
 
32
  result = call_nlp_service({"text": text_content}, "recursive_character_text_splitter")
33
  chunks = result["chunks"]
34
 
35
+ async def process_chunk(chunk):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  try:
37
+ response = await openai_client.chat.completions.create(
38
+ model="gpt-4o-mini",
39
+ messages=[
40
+ {
41
+ "role": "system",
42
+ "content": "You are a legal expert analyzing German regulatory changes. Return only JSON.",
43
+ },
44
+ {
45
+ "role": "user",
46
+ "content": create_prompt_with_nlp(chunk, preprocessed_data),
47
+ },
48
+ ],
49
+ temperature=0.7,
50
+ max_tokens=1024,
51
+ )
52
  result = json.loads(response.choices[0].message.content)
53
  if result.get("changes_detected", False):
54
  result["location"] = location_info
55
  result["source_text"] = chunk
56
+ return result
57
+ except (json.JSONDecodeError, Exception):
58
+ return None
59
+
60
+ tasks = [process_chunk(chunk) for chunk in chunks]
61
+ results = await asyncio.gather(*tasks)
62
+ filtered_results = [r for r in results if r is not None]
63
+ return filtered_results if filtered_results else None
64
 
65
+ # Async wrapper for backward compatibility
66
+ async def classify_changes_with_nlp_async(text_content, location_info):
67
+ return await classify_changes_with_nlp(text_content, location_info)
68
 
69
 
70
  def extract_hierarchical_text(block):
 
91
  return "\n\n".join(text_parts)
92
 
93
 
94
+ async def traverse_blocks_with_nlp(blocks, parent=None):
95
+ """Traverse hierarchy with NLP-enhanced analysis using asyncio.gather()."""
 
 
 
 
96
 
97
+ async def process_block(block, parent):
 
 
 
 
98
  block["parent"] = parent
99
+
100
  if "children" in block and not block["children"]: # Leaf node
101
  text_content = extract_hierarchical_text(block)
102
  location_info = {
103
  "page_number": block["page_number"],
104
  "block_text": block["text"],
105
  }
106
+
107
+ changes = await classify_changes_with_nlp(text_content, location_info)
108
  if changes:
109
  for change in changes:
110
  change["full_text"] = text_content
111
+ return changes
112
  else:
113
+ # Process children recursively
114
+ return await traverse_blocks_with_nlp(block["children"], block)
115
+ return []
116
+
117
+ # Process all blocks concurrently
118
+ tasks = [process_block(block, parent) for block in blocks]
119
+ results = await asyncio.gather(*tasks)
120
+
121
+ # Flatten results
122
+ flattened = []
123
+ for result in results:
124
+ if isinstance(result, list):
125
+ flattened.extend(result)
126
+ return flattened
127
 
128
 
129
  def pymupdf_regulatory_change_detector_with_nlp_insights(hierarchical_structure, progress_callback=None, status_callback=None):
 
138
  changes_by_page = {}
139
 
140
  if status_callback:
141
+ status_callback("Analyzing all document blocks concurrently with NLP...")
142
 
143
+ # Run async processing
144
+ results = asyncio.run(traverse_blocks_with_nlp(hierarchical_structure["blocks"]))
145
 
146
  for change in results:
147
  analysis_summary["total_changes_detected"] += len(change["classifications"])
scripts/pymupdf_no_nlp_preprocessing.py CHANGED
@@ -1,7 +1,8 @@
1
  import json
2
  import os
 
3
  from dotenv import load_dotenv
4
- from openai import OpenAI
5
  from scripts.regulatory_change_foundation import CONTEXT_CATEGORIES
6
  from scripts.utility_functions import render_prompt
7
  from scripts.pymupdf_nlp_preprocessing import extract_hierarchical_text
@@ -12,84 +13,83 @@ load_dotenv()
12
 
13
  #nlp = spacy.load("de_core_news_sm")
14
  api_key = os.getenv("OPENAI_API_KEY")
15
- openai_client = OpenAI(api_key=api_key)
16
 
17
 
18
  def create_prompt_without_nlp_insights(text):
19
  return render_prompt(text, include_nlp=False)
20
 
21
 
22
- def classify_changes_without_nlp_insights(text_content, location_info):
23
  """Classify changes in text chunks using OpenAI."""
24
 
25
- response = openai_client.chat.completions.create(
26
- model="gpt-4o-mini",
27
- messages=[
28
- {
29
- "role": "system",
30
- "content": "You are a legal expert specializing in analyzing German regulatory documents with a focus on identifying regulatory changes. Only return JSON output.",
31
- },
32
- {
33
- "role": "user",
34
- "content": create_prompt_without_nlp_insights(text_content),
35
- },
36
- ],
37
- temperature=0.7,
38
- max_tokens=1024,
39
- )
40
-
41
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  result = json.loads(response.choices[0].message.content)
43
  if result.get("changes_detected", False):
44
  result["location"] = location_info
45
  result["source_text"] = text_content
46
  return result
47
- return None
48
- except json.JSONDecodeError:
49
- return None
50
 
 
 
 
51
 
52
- def traverse_blocks(
53
- blocks, parent=None, grandparent=None, results=None, is_top_level=True, progress_callback=None, status_callback=None
54
- ):
55
- """Traverse the hierarchical structure in a depth-first manner and analyze leaf nodes."""
56
- if results is None:
57
- results = []
58
-
59
- total_blocks = len(blocks) if is_top_level else 0
60
 
61
- for idx, block in enumerate(blocks):
62
- if is_top_level and progress_callback:
63
- progress_callback((idx + 1) / total_blocks)
64
- if is_top_level and status_callback:
65
- status_callback(f"Processing text block {idx + 1}/{total_blocks}")
66
- # Add parent and grandparent references to the block for context tracking
67
  block["parent"] = parent
68
-
69
- if "children" in block and (
70
- not block["children"] or len(block["children"]) == 0
71
- ): # This is a leaf node
72
  # Extract hierarchical text
73
  text_content = extract_hierarchical_text(block)
74
-
75
  # Define location info
76
  location_info = {
77
  "page_number": block["page_number"],
78
  "block_text": block["text"],
79
  }
80
-
81
  # Analyze the text for changes
82
- changes = classify_changes_without_nlp_insights(text_content, location_info)
83
  if changes:
84
  # Add the full hierarchical text to the result
85
  changes["text"] = text_content
86
- results.append(changes)
87
  else:
88
- traverse_blocks(
89
- block["children"], block, parent, results, is_top_level=False, progress_callback=progress_callback, status_callback=status_callback
90
- )
91
-
92
- return results
 
 
 
 
 
 
 
 
 
93
 
94
 
95
  def pymupdf_regulatory_change_detector_without_nlp_insights(hierarchical_structure, progress_callback=None, status_callback=None):
@@ -104,10 +104,10 @@ def pymupdf_regulatory_change_detector_without_nlp_insights(hierarchical_structu
104
  changes_by_page = {}
105
 
106
  if status_callback:
107
- status_callback("Analyzing document structure...")
108
 
109
- # Traverse the blocks and analyze leaf nodes
110
- results = traverse_blocks(hierarchical_structure["blocks"], progress_callback=progress_callback, status_callback=status_callback)
111
 
112
  # Update analysis summary
113
  for change in results:
 
1
  import json
2
  import os
3
+ import asyncio
4
  from dotenv import load_dotenv
5
+ from openai import AsyncOpenAI
6
  from scripts.regulatory_change_foundation import CONTEXT_CATEGORIES
7
  from scripts.utility_functions import render_prompt
8
  from scripts.pymupdf_nlp_preprocessing import extract_hierarchical_text
 
13
 
14
  #nlp = spacy.load("de_core_news_sm")
15
  api_key = os.getenv("OPENAI_API_KEY")
16
+ openai_client = AsyncOpenAI(api_key=api_key, timeout=60)
17
 
18
 
19
  def create_prompt_without_nlp_insights(text):
20
  return render_prompt(text, include_nlp=False)
21
 
22
 
23
+ async def classify_changes_without_nlp_insights(text_content, location_info):
24
  """Classify changes in text chunks using OpenAI."""
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  try:
27
+ response = await openai_client.chat.completions.create(
28
+ model="gpt-4o-mini",
29
+ messages=[
30
+ {
31
+ "role": "system",
32
+ "content": "You are a legal expert specializing in analyzing German regulatory documents with a focus on identifying regulatory changes. Only return JSON output.",
33
+ },
34
+ {
35
+ "role": "user",
36
+ "content": create_prompt_without_nlp_insights(text_content),
37
+ },
38
+ ],
39
+ temperature=0.7,
40
+ max_tokens=1024,
41
+ )
42
  result = json.loads(response.choices[0].message.content)
43
  if result.get("changes_detected", False):
44
  result["location"] = location_info
45
  result["source_text"] = text_content
46
  return result
47
+ except (json.JSONDecodeError, Exception):
48
+ pass
49
+ return None
50
 
51
+ # Async wrapper for backward compatibility
52
+ async def classify_changes_without_nlp_insights_async(text_content, location_info):
53
+ return await classify_changes_without_nlp_insights(text_content, location_info)
54
 
55
+
56
+ async def traverse_blocks(blocks, parent=None):
57
+ """Traverse the hierarchical structure and analyze leaf nodes using asyncio.gather()."""
 
 
 
 
 
58
 
59
+ async def process_block(block, parent):
 
 
 
 
 
60
  block["parent"] = parent
61
+
62
+ if "children" in block and (not block["children"] or len(block["children"]) == 0): # Leaf node
 
 
63
  # Extract hierarchical text
64
  text_content = extract_hierarchical_text(block)
65
+
66
  # Define location info
67
  location_info = {
68
  "page_number": block["page_number"],
69
  "block_text": block["text"],
70
  }
71
+
72
  # Analyze the text for changes
73
+ changes = await classify_changes_without_nlp_insights(text_content, location_info)
74
  if changes:
75
  # Add the full hierarchical text to the result
76
  changes["text"] = text_content
77
+ return [changes]
78
  else:
79
+ # Process children recursively
80
+ return await traverse_blocks(block["children"], block)
81
+ return []
82
+
83
+ # Process all blocks concurrently
84
+ tasks = [process_block(block, parent) for block in blocks]
85
+ results = await asyncio.gather(*tasks)
86
+
87
+ # Flatten results
88
+ flattened = []
89
+ for result in results:
90
+ if isinstance(result, list):
91
+ flattened.extend(result)
92
+ return flattened
93
 
94
 
95
  def pymupdf_regulatory_change_detector_without_nlp_insights(hierarchical_structure, progress_callback=None, status_callback=None):
 
104
  changes_by_page = {}
105
 
106
  if status_callback:
107
+ status_callback("Analyzing all document blocks concurrently...")
108
 
109
+ # Run async processing
110
+ results = asyncio.run(traverse_blocks(hierarchical_structure["blocks"]))
111
 
112
  # Update analysis summary
113
  for change in results:
scripts/text_extraction_landing_ai.py CHANGED
@@ -1,11 +1,12 @@
1
  import os
2
  import json
3
  import glob
 
4
  from agentic_doc.parse import parse
5
 
6
  from scripts.models import RegulatoryChange
7
- from scripts.pymupdf_nlp_preprocessing import classify_changes_with_nlp
8
- from scripts.pymupdf_no_nlp_preprocessing import classify_changes_without_nlp_insights
9
 
10
 
11
  def extract_document_agentic(
@@ -43,41 +44,51 @@ def extract_document_agentic(
43
  return result
44
  if result:
45
  if "chunks" in result and isinstance(result["chunks"], list):
46
- for chunk in result["chunks"]:
47
- if do_nlp_preprocessing:
48
- classification_result = classify_changes_with_nlp(chunk["text"], "")
49
- # flatten into a single json element so it matches non-nlp part
50
- if classification_result and len(classification_result) > 0:
51
- flattened_classifications = {"changes_detected": classification_result[0].get("changes_detected", False), "classifications": []}
52
- for class_res in classification_result:
53
- if class_res.get("changes_detected", False):
54
- flattened_classifications["classifications"].extend(class_res.get("classifications", []))
55
- classification_result = flattened_classifications
56
- else:
57
- classification_result = classify_changes_without_nlp_insights(
58
- chunk["text"], ""
59
- )
60
- if classification_result and classification_result.get(
61
- "changes_detected", False
62
- ):
63
- subchunks = []
64
- for subchunk in classification_result.get(
65
- "classifications", []
66
- ):
67
- subchunks.append(
68
- {
69
- "text": subchunk.get("relevant_text", ""),
70
- "validated": False,
71
- "confirmed": False,
72
- "reviewed": False,
73
- "category": subchunk.get("change", ""),
74
- "type": subchunk.get("change_type", ""),
75
- "context": subchunk.get("explanation", ""),
76
- }
77
  )
78
- chunk["subchunks"] = subchunks
79
- else:
80
- result["chunks"].remove(chunk)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  # Create flattened list of subchunks for UI compatibility
82
  flattened_changes = []
83
  for chunk in result["chunks"]:
 
1
  import os
2
  import json
3
  import glob
4
+ import asyncio
5
  from agentic_doc.parse import parse
6
 
7
  from scripts.models import RegulatoryChange
8
+ from scripts.pymupdf_nlp_preprocessing import classify_changes_with_nlp_async
9
+ from scripts.pymupdf_no_nlp_preprocessing import classify_changes_without_nlp_insights_async
10
 
11
 
12
  def extract_document_agentic(
 
44
  return result
45
  if result:
46
  if "chunks" in result and isinstance(result["chunks"], list):
47
+ # Process all chunks concurrently with asyncio.gather()
48
+ async def process_all_chunks():
49
+ async def process_chunk(chunk):
50
+ if do_nlp_preprocessing:
51
+ classification_result = await classify_changes_with_nlp_async(chunk["text"], "")
52
+ # flatten into a single json element so it matches non-nlp part
53
+ if classification_result and len(classification_result) > 0:
54
+ flattened_classifications = {"changes_detected": classification_result[0].get("changes_detected", False), "classifications": []}
55
+ for class_res in classification_result:
56
+ if class_res.get("changes_detected", False):
57
+ flattened_classifications["classifications"].extend(class_res.get("classifications", []))
58
+ classification_result = flattened_classifications
59
+ else:
60
+ classification_result = await classify_changes_without_nlp_insights_async(
61
+ chunk["text"], ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  )
63
+
64
+ if classification_result and classification_result.get("changes_detected", False):
65
+ subchunks = []
66
+ for subchunk in classification_result.get("classifications", []):
67
+ subchunks.append(
68
+ {
69
+ "text": subchunk.get("relevant_text", ""),
70
+ "validated": False,
71
+ "confirmed": False,
72
+ "reviewed": False,
73
+ "category": subchunk.get("change", ""),
74
+ "type": subchunk.get("change_type", ""),
75
+ "context": subchunk.get("explanation", ""),
76
+ }
77
+ )
78
+ chunk["subchunks"] = subchunks
79
+ return chunk, True
80
+ return chunk, False
81
+
82
+ # Process all chunks concurrently
83
+ tasks = [process_chunk(chunk) for chunk in result["chunks"]]
84
+ results = await asyncio.gather(*tasks)
85
+ return results
86
+
87
+ # Run async processing
88
+ processed_results = asyncio.run(process_all_chunks())
89
+
90
+ # Remove chunks without changes
91
+ result["chunks"] = [chunk for chunk, has_changes in processed_results if has_changes]
92
  # Create flattened list of subchunks for UI compatibility
93
  flattened_changes = []
94
  for chunk in result["chunks"]:
scripts/utility_functions.py CHANGED
@@ -288,6 +288,31 @@ def remove_html_comments(text: str) -> str:
288
  return clean_text
289
 
290
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
291
  def highlight_differences_words(text1: str, text2: str):
292
  """
293
  Return two HTML strings: highlighted version of text1 and text2.
@@ -352,6 +377,8 @@ def map_categorical_impact_assessment(
352
  changes: list[RegulatoryChange],
353
  ) -> list[RegulatoryChange]:
354
  """Map categorical impact assessment actions based on changetype"""
 
 
355
  action_map = {
356
  "Textual and Editorial Changes": {
357
  "actions": [
@@ -397,7 +424,8 @@ def map_categorical_impact_assessment(
397
  expected_labels = [action["label"] for action in mapped_actions]
398
 
399
  # Only update if the labels don't match
 
400
  if current_labels != expected_labels:
401
- change.actions = mapped_actions
402
  # If labels match but user has different completion status, preserve their progress
403
  return changes
 
288
  return clean_text
289
 
290
 
291
+ def normalize_markdown_indentation(content):
292
+ """Normalize excessive indentation to prevent code block interpretation."""
293
+ lines = content.split("\n")
294
+ normalized_lines = []
295
+
296
+ for line in lines:
297
+ # Check if line is a list item with excessive indentation
298
+ stripped = line.lstrip()
299
+ if stripped.startswith(("-", "*", "+")):
300
+ # Count leading spaces
301
+ leading_spaces = len(line) - len(stripped)
302
+ # Normalize to max 4 spaces for nested lists
303
+ if leading_spaces > 4:
304
+ # Convert to proper nested list (2 spaces per level)
305
+ nest_level = min(leading_spaces // 6, 2) # Max 2 levels deep
306
+ normalized_line = " " * nest_level + stripped
307
+ normalized_lines.append(normalized_line)
308
+ else:
309
+ normalized_lines.append(line)
310
+ else:
311
+ normalized_lines.append(line)
312
+
313
+ return "\n".join(normalized_lines)
314
+
315
+
316
  def highlight_differences_words(text1: str, text2: str):
317
  """
318
  Return two HTML strings: highlighted version of text1 and text2.
 
377
  changes: list[RegulatoryChange],
378
  ) -> list[RegulatoryChange]:
379
  """Map categorical impact assessment actions based on changetype"""
380
+ import copy
381
+
382
  action_map = {
383
  "Textual and Editorial Changes": {
384
  "actions": [
 
424
  expected_labels = [action["label"] for action in mapped_actions]
425
 
426
  # Only update if the labels don't match
427
+ # Create deep copies to prevent shared references across changes
428
  if current_labels != expected_labels:
429
+ change.actions = copy.deepcopy(mapped_actions)
430
  # If labels match but user has different completion status, preserve their progress
431
  return changes