Maximilian Amougou committed on
Commit
09a324c
·
verified ·
1 Parent(s): a60b2bb

Upload 7 files

Browse files
scripts/agentic_pdfeditor.py CHANGED
@@ -100,10 +100,18 @@ def agentic_pdf_annotator(changes: list[RegulatoryChange], file_bytes, extractio
100
  # Sort by length of relevant_text in descending order to avoid overlapping highlights
101
  changes = sorted(changes, key=lambda c: -len(c.text))
102
  annotated_areas = {}
 
 
 
 
103
  full_text = ""
104
  for page_num in range(len(doc)):
105
  page = doc[page_num]
106
- full_text += page.get_text()
 
 
 
 
107
  for change in changes:
108
  page_num = int(change.grounding[0].page)
109
  text = change.text
@@ -114,7 +122,7 @@ def agentic_pdf_annotator(changes: list[RegulatoryChange], file_bytes, extractio
114
  results = []
115
  for pnr in range(len(doc)): # search all pages
116
  annotated_areas.setdefault(f"{pnr}", [])
117
- page = doc.load_page(pnr)
118
  text_instances = page.search_for(text)
119
  for inst in text_instances:
120
  page_num = pnr# remove?
@@ -133,7 +141,7 @@ def agentic_pdf_annotator(changes: list[RegulatoryChange], file_bytes, extractio
133
  if best_match and len(best_match) > 0:
134
  print("found best fuzzy match: ", best_match)
135
  for page_num in range(len(doc)): # search all pages
136
- page = doc.load_page(page_num)
137
  text_instances = page.search_for(best_match)
138
  for inst in text_instances:
139
  results.append({"page": page_num, "bbox": inst})
@@ -149,10 +157,10 @@ def agentic_pdf_annotator(changes: list[RegulatoryChange], file_bytes, extractio
149
  )
150
  if results: # "flattenning" the results
151
  page_num = results[0]["page"]
152
- doc_page = doc.load_page(page_num)
153
  results = [r["bbox"] for r in results if r["page"] == page_num]
154
  else:
155
- doc_page = doc.load_page(page_num)
156
  annotated_areas.setdefault(f"{page_num}", [])
157
  # Search for the relevant text on the page
158
  results = doc_page.search_for(text)
@@ -168,7 +176,7 @@ def agentic_pdf_annotator(changes: list[RegulatoryChange], file_bytes, extractio
168
  )
169
  if not results:
170
  best_match = get_best_fuzzy_match(
171
- doc_page.get_text(option="text"), change
172
  )
173
  if best_match and len(best_match) > 0:
174
  results = doc_page.search_for(best_match)
 
100
  # Sort by length of relevant_text in descending order to avoid overlapping highlights
101
  changes = sorted(changes, key=lambda c: -len(c.text))
102
  annotated_areas = {}
103
+
104
+ # OPTIMIZATION: Pre-cache all pages and their text content
105
+ page_cache = {}
106
+ page_text_cache = {}
107
  full_text = ""
108
  for page_num in range(len(doc)):
109
  page = doc[page_num]
110
+ page_cache[page_num] = page
111
+ page_text = page.get_text()
112
+ page_text_cache[page_num] = page_text
113
+ full_text += page_text
114
+
115
  for change in changes:
116
  page_num = int(change.grounding[0].page)
117
  text = change.text
 
122
  results = []
123
  for pnr in range(len(doc)): # search all pages
124
  annotated_areas.setdefault(f"{pnr}", [])
125
+ page = page_cache[pnr] # Use cached page
126
  text_instances = page.search_for(text)
127
  for inst in text_instances:
128
  page_num = pnr# remove?
 
141
  if best_match and len(best_match) > 0:
142
  print("found best fuzzy match: ", best_match)
143
  for page_num in range(len(doc)): # search all pages
144
+ page = page_cache[page_num] # Use cached page
145
  text_instances = page.search_for(best_match)
146
  for inst in text_instances:
147
  results.append({"page": page_num, "bbox": inst})
 
157
  )
158
  if results: # "flattenning" the results
159
  page_num = results[0]["page"]
160
+ doc_page = page_cache[page_num] # Use cached page
161
  results = [r["bbox"] for r in results if r["page"] == page_num]
162
  else:
163
+ doc_page = page_cache[page_num] # Use cached page
164
  annotated_areas.setdefault(f"{page_num}", [])
165
  # Search for the relevant text on the page
166
  results = doc_page.search_for(text)
 
176
  )
177
  if not results:
178
  best_match = get_best_fuzzy_match(
179
+ page_text_cache[page_num], change # Use cached text
180
  )
181
  if best_match and len(best_match) > 0:
182
  results = doc_page.search_for(best_match)
scripts/llm_nlp_preprocessing.py CHANGED
@@ -14,8 +14,8 @@ api_key = os.getenv("OPENAI_API_KEY")
14
  openai_client = AsyncOpenAI(api_key=api_key, timeout=60)
15
 
16
 
17
- def preprocess_text_with_nlp(text, max_chunk_size=512, overlap=50):
18
- result = call_nlp_service({"text": text}, "preprocess_text_with_nlp_llm")
19
  return result["chunks"], result["preprocessed_data"]
20
 
21
 
@@ -64,7 +64,7 @@ async def detect_regulatory_changes(text_content, subtitle):
64
  """
65
 
66
  # Preprocess text with enhanced NLP
67
- chunks, preprocessed_data = preprocess_text_with_nlp(text_content)
68
 
69
  # Classify changes using NLP insights
70
  results = await search_for_regulatory_changes(chunks, preprocessed_data, subtitle)
 
14
  openai_client = AsyncOpenAI(api_key=api_key, timeout=60)
15
 
16
 
17
async def preprocess_text_with_nlp(text, max_chunk_size=512, overlap=50):
    """Preprocess ``text`` via the ``preprocess_text_with_nlp_llm`` NLP endpoint.

    Args:
        text: Text forwarded to the service under the ``"text"`` key.
        max_chunk_size: Accepted for interface compatibility; not used here.
        overlap: Accepted for interface compatibility; not used here.

    Returns:
        A ``(chunks, preprocessed_data)`` tuple read from the service response.
    """
    payload = {"text": text}
    response = await call_nlp_service(payload, "preprocess_text_with_nlp_llm")
    chunks = response["chunks"]
    preprocessed_data = response["preprocessed_data"]
    return chunks, preprocessed_data
20
 
21
 
 
64
  """
65
 
66
  # Preprocess text with enhanced NLP
67
+ chunks, preprocessed_data = await preprocess_text_with_nlp(text_content)
68
 
69
  # Classify changes using NLP insights
70
  results = await search_for_regulatory_changes(chunks, preprocessed_data, subtitle)
scripts/pymupdf_nlp_preprocessing.py CHANGED
@@ -14,9 +14,9 @@ api_key = os.getenv("OPENAI_API_KEY")
14
  openai_client = AsyncOpenAI(api_key=api_key, timeout=60)
15
 
16
 
17
- def preprocess_text_with_nlp(text, max_chunk_size=512, overlap=50):
18
  """Enhanced NLP preprocessing identical to your first experiment using PyMuPDF text extraction"""
19
- return call_nlp_service({"text": text}, "preprocess_text_with_nlp_pymupdf")
20
 
21
 
22
  def create_prompt_with_nlp(chunk, preprocessed_data):
@@ -26,10 +26,10 @@ def create_prompt_with_nlp(chunk, preprocessed_data):
26
  async def classify_changes_with_nlp(text_content, location_info):
27
  """Classify changes with NLP preprocessing."""
28
  # Apply NLP preprocessing
29
- preprocessed_data = preprocess_text_with_nlp(text_content)
30
 
31
  # Split into chunks (using the same method as your first experiment)
32
- result = call_nlp_service({"text": text_content}, "recursive_character_text_splitter")
33
  chunks = result["chunks"]
34
 
35
  async def process_chunk(chunk):
 
14
  openai_client = AsyncOpenAI(api_key=api_key, timeout=60)
15
 
16
 
17
async def preprocess_text_with_nlp(text, max_chunk_size=512, overlap=50):
    """Preprocess ``text`` via the ``preprocess_text_with_nlp_pymupdf`` NLP endpoint.

    Args:
        text: Text forwarded to the service under the ``"text"`` key.
        max_chunk_size: Accepted for interface compatibility; not used here.
        overlap: Accepted for interface compatibility; not used here.

    Returns:
        Whatever the NLP service returns for this endpoint, unmodified.
    """
    payload = {"text": text}
    service_response = await call_nlp_service(
        payload, "preprocess_text_with_nlp_pymupdf"
    )
    return service_response
20
 
21
 
22
  def create_prompt_with_nlp(chunk, preprocessed_data):
 
26
  async def classify_changes_with_nlp(text_content, location_info):
27
  """Classify changes with NLP preprocessing."""
28
  # Apply NLP preprocessing
29
+ preprocessed_data = await preprocess_text_with_nlp(text_content)
30
 
31
  # Split into chunks (using the same method as your first experiment)
32
+ result = await call_nlp_service({"text": text_content}, "recursive_character_text_splitter")
33
  chunks = result["chunks"]
34
 
35
  async def process_chunk(chunk):
scripts/utility_functions.py CHANGED
@@ -3,6 +3,11 @@ import html
3
  import os
4
  import json
5
  import re
 
 
 
 
 
6
  import pymupdf
7
  import pymupdf4llm
8
  from rapidfuzz import fuzz
@@ -15,6 +20,7 @@ from scripts.regulatory_change_foundation import (
15
  BASE_PROMPT_TEMPLATE,
16
  )
17
 
 
18
  # Define hex colors as RGB tuples (0–1 range)
19
  color_mapping_old = {
20
  "addition": (0, 0.4, 0), # green
@@ -118,7 +124,15 @@ def get_tooltip_text(change):
118
  def highlight_nth(text, change, skip_failed=False):
119
  n = change.occurrence_index if hasattr(change, "occurrence_index") else 0
120
  target = re.sub(r"\\\s+", r".*?", change.text)
121
- matches = list(re.finditer(target, text, flags=re.IGNORECASE | re.DOTALL))
 
 
 
 
 
 
 
 
122
  if len(matches) > n:
123
  match = matches[n]
124
  start, end = match.start(), match.end()
@@ -244,15 +258,59 @@ def save_json_to_file(data, output_dir, output_file):
244
  print(f"JSON data saved successfully at: {file_path}")
245
 
246
 
247
- def call_nlp_service(payload, method):
248
- url = f"https://amougou-fortiss-nlp-preprocessor.hf.space/{method}"
 
 
 
 
 
 
249
 
250
- # Make the request
251
- response = requests.post(url, data=payload)
252
- if response.status_code == 200:
253
- return response.json()
254
- else:
255
- raise Exception(f"NLP service error: {response.status_code} - {response.text}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
 
257
 
258
  def lerp_color(value, start_color=(255, 0, 0), end_color=(0, 255, 0)):
 
3
  import os
4
  import json
5
  import re
6
+ import time
7
+ import random
8
+ import asyncio
9
+ import httpx
10
+ from dotenv import load_dotenv
11
  import pymupdf
12
  import pymupdf4llm
13
  from rapidfuzz import fuzz
 
20
  BASE_PROMPT_TEMPLATE,
21
  )
22
 
23
+ load_dotenv()
24
  # Define hex colors as RGB tuples (0–1 range)
25
  color_mapping_old = {
26
  "addition": (0, 0.4, 0), # green
 
124
  def highlight_nth(text, change, skip_failed=False):
125
  n = change.occurrence_index if hasattr(change, "occurrence_index") else 0
126
  target = re.sub(r"\\\s+", r".*?", change.text)
127
+
128
+ # OPTIMIZATION: Compile regex once and find only up to n+1 matches (early exit)
129
+ pattern = re.compile(target, flags=re.IGNORECASE | re.DOTALL)
130
+ matches = []
131
+ for match in pattern.finditer(text):
132
+ matches.append(match)
133
+ if len(matches) > n: # Early exit - we have enough matches
134
+ break
135
+
136
  if len(matches) > n:
137
  match = matches[n]
138
  start, end = match.start(), match.end()
 
258
  print(f"JSON data saved successfully at: {file_path}")
259
 
260
 
261
# Bearer token sent to the NLP microservice; read once from the environment.
MICROSERVICE_KEY = os.getenv("MICROSERVICE_KEY")
nlp_semaphore = asyncio.Semaphore(100)  # Limit to 100 concurrent requests
timeout = httpx.Timeout(
    connect=20.0,  # time to establish connection
    read=60.0,  # time to read the response
    write=30.0,  # time to send the request
    pool=80.0,  # time to acquire a connection from the pool
)


def _nlp_retry_delay(attempt, base_delay, retry_after=None):
    """Return how many seconds to wait before the next attempt.

    A numeric ``retry_after`` (the server's ``Retry-After`` header) wins.
    Otherwise exponential backoff with up to 0.5 s of jitter is used.
    """
    if retry_after:
        try:
            return max(0.0, float(retry_after))
        except ValueError:
            # Retry-After may also be an HTTP-date; rather than crash on
            # float(), fall back to the regular backoff schedule.
            pass
    return base_delay * (2**attempt) + random.uniform(0, 0.5)


async def call_nlp_service(payload, method, max_retries=5, base_delay=1.0):
    """POST ``payload`` to the NLP preprocessing service and return its JSON.

    Concurrency is bounded by ``nlp_semaphore``. Note that the slot is held
    for the whole call, including the backoff sleeps between retries.

    Args:
        payload: Passed as the ``data`` argument of the POST request.
        method: Endpoint name appended to the service base URL.
        max_retries: Maximum number of attempts (not additional retries).
        base_delay: Base delay in seconds for the exponential backoff.

    Returns:
        The decoded JSON body of a 200 response.

    Raises:
        Exception: On any non-200/non-429 status, when retries are exhausted
            on 429 responses, or when connect/read timeouts and network
            errors persist through the last attempt.
    """
    url = f"https://amougou-fortiss-nlp-preprocessor.hf.space/{method}"
    headers = {"Authorization": f"Bearer {MICROSERVICE_KEY}"}
    retryable_errors = (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.NetworkError)

    async with nlp_semaphore:
        # One client for all attempts: building it is loop-invariant.
        async with httpx.AsyncClient(timeout=timeout) as client:
            for attempt in range(max_retries):
                is_last_attempt = attempt == max_retries - 1

                try:
                    response = await client.post(url, data=payload, headers=headers)
                except retryable_errors as e:
                    # Retry on network issues
                    if is_last_attempt:
                        raise Exception(
                            f"NLP service network error after {max_retries} attempts: {e}"
                        ) from e
                    await asyncio.sleep(_nlp_retry_delay(attempt, base_delay))
                    continue

                # Success
                if response.status_code == 200:
                    return response.json()

                # Other HTTP errors are not retried
                if response.status_code != 429:
                    raise Exception(
                        f"NLP service error: {response.status_code} - {response.text}"
                    )

                # Rate limited: give up after the final attempt, else wait
                if is_last_attempt:
                    break
                await asyncio.sleep(
                    _nlp_retry_delay(
                        attempt, base_delay, response.headers.get("Retry-After")
                    )
                )

    raise Exception(f"NLP service error: failed after {max_retries} retries")
314
 
315
 
316
  def lerp_color(value, start_color=(255, 0, 0), end_color=(0, 255, 0)):